diff options
Diffstat (limited to 'net')
372 files changed, 15317 insertions, 10024 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index faf65baed617..34e44c0c0836 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -20,7 +20,7 @@ int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { - int ret; + int i, ret; dev->addr_len = EUI64_ADDR_LEN; dev->type = ARPHRD_6LOWPAN; @@ -29,6 +29,10 @@ int lowpan_register_netdevice(struct net_device *dev, lowpan_priv(dev)->lltype = lltype; + spin_lock_init(&lowpan_priv(dev)->ctx.lock); + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + lowpan_priv(dev)->ctx.table[i].id = i; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -68,6 +72,32 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +static int lowpan_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int i; + + if (dev->type != ARPHRD_6LOWPAN) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_DOWN: + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, + &lowpan_priv(dev)->ctx.table[i].flags); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block lowpan_notifier = { + .notifier_call = lowpan_event, +}; + static int __init lowpan_module_init(void) { int ret; @@ -76,6 +106,12 @@ static int __init lowpan_module_init(void) if (ret < 0) return ret; + ret = register_netdevice_notifier(&lowpan_notifier); + if (ret < 0) { + lowpan_debugfs_exit(); + return ret; + } + request_module_nowait("ipv6"); request_module_nowait("nhc_dest"); @@ -92,6 +128,7 @@ static int __init lowpan_module_init(void) static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); + unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 88eef84df0fc..aa49ff4ce6fd 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -16,19 +16,266 @@ #include "6lowpan_i.h" +#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS 8 + static struct dentry *lowpan_debugfs; +static int lowpan_ctx_flag_active_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + + if (val != 0 && val != 1) + return -EINVAL; + + if (val) + set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); + else + clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); + + return 0; +} + +static int lowpan_ctx_flag_active_get(void *data, u64 *val) +{ + *val = lowpan_iphc_ctx_is_active(data); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops, + lowpan_ctx_flag_active_get, + lowpan_ctx_flag_active_set, "%llu\n"); + +static int lowpan_ctx_flag_c_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + + if (val != 0 && val != 1) + return -EINVAL; + + if (val) + set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); + else + clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); + + return 0; +} + +static int lowpan_ctx_flag_c_get(void *data, u64 *val) +{ + *val = lowpan_iphc_ctx_is_compression(data); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get, + lowpan_ctx_flag_c_set, "%llu\n"); + +static int lowpan_ctx_plen_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + if (val > 128) + return -EINVAL; + + spin_lock_bh(&t->lock); + ctx->plen = val; + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_ctx_plen_get(void *data, u64 *val) +{ + struct lowpan_iphc_ctx *ctx = data; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + spin_lock_bh(&t->lock); + *val = ctx->plen; + spin_unlock_bh(&t->lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get, + lowpan_ctx_plen_set, "%llu\n"); + +static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset) +{ + struct lowpan_iphc_ctx *ctx = file->private; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + spin_lock_bh(&t->lock); + seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(ctx->pfx.s6_addr16[0]), + be16_to_cpu(ctx->pfx.s6_addr16[1]), + be16_to_cpu(ctx->pfx.s6_addr16[2]), + be16_to_cpu(ctx->pfx.s6_addr16[3]), + be16_to_cpu(ctx->pfx.s6_addr16[4]), + be16_to_cpu(ctx->pfx.s6_addr16[5]), + be16_to_cpu(ctx->pfx.s6_addr16[6]), + be16_to_cpu(ctx->pfx.s6_addr16[7])); + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file) +{ + return single_open(file, lowpan_ctx_pfx_show, inode->i_private); +} + +static ssize_t lowpan_ctx_pfx_write(struct file *fp, + const char __user *user_buf, size_t count, + loff_t *ppos) +{ + char buf[128] = {}; + struct seq_file *file = fp->private_data; + struct lowpan_iphc_ctx *ctx = file->private; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + int status = count, n, i; + unsigned int addr[8]; + + if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1, + count))) { + status = -EFAULT; + goto out; + } + + n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + &addr[0], &addr[1], &addr[2], &addr[3], &addr[4], + &addr[5], &addr[6], &addr[7]); + if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) { + status = -EINVAL; + goto out; + } + + spin_lock_bh(&t->lock); + for (i = 0; i < 8; i++) + ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff); + spin_unlock_bh(&t->lock); + +out: + return status; +} + +const struct file_operations lowpan_ctx_pfx_fops = { + .open = lowpan_ctx_pfx_open, + .read = seq_read, + .write = lowpan_ctx_pfx_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, + struct dentry *ctx, u8 id) +{ + struct lowpan_priv *lpriv = lowpan_priv(dev); + struct dentry *dentry, *root; + char buf[32]; + + WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); + + sprintf(buf, "%d", id); + + root = debugfs_create_dir(buf, ctx); + if (!root) + return -EINVAL; + + dentry = debugfs_create_file("active", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_flag_active_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("compression", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_flag_c_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("prefix", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_pfx_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("prefix_len", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_plen_fops); + if (!dentry) + return -EINVAL; + + return 0; +} + +static int lowpan_context_show(struct seq_file *file, void *offset) +{ + struct lowpan_iphc_ctx_table *t = file->private; + int i; + + seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C'); + seq_puts(file, "-------------------------------------------------\n"); + + spin_lock_bh(&t->lock); + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + if (!lowpan_iphc_ctx_is_active(&t->table[i])) + continue; + + seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id, + &t->table[i].pfx, t->table[i].plen, + lowpan_iphc_ctx_is_compression(&t->table[i])); + } + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_context_open(struct inode *inode, struct file *file) +{ + return single_open(file, lowpan_context_show, inode->i_private); +} + +const struct file_operations lowpan_context_fops = { + .open = lowpan_context_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + int lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_priv *lpriv = lowpan_priv(dev); + struct dentry *contexts, *dentry; + int ret, i; /* creating the root */ lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); if (!lpriv->iface_debugfs) goto fail; + contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs); + if (!contexts) + goto remove_root; + + dentry = debugfs_create_file("show", 0644, contexts, + &lowpan_priv(dev)->ctx, + &lowpan_context_fops); + if (!dentry) + goto remove_root; + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i); + if (ret < 0) + goto remove_root; + } + return 0; +remove_root: + lowpan_dev_debugfs_exit(dev); fail: return -EINVAL; } diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 346b5c1a9185..72172514fea0 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -56,6 +56,7 @@ /* special link-layer handling */ #include <net/mac802154.h> +#include "6lowpan_i.h" #include "nhc.h" /* Values of fields within the IPHC encoding first byte */ @@ -147,6 +148,9 @@ (((a)->s6_addr16[6]) == 0) && \ (((a)->s6_addr[14]) == 0)) +#define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f) +#define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4) + static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, const void *lladdr) { @@ -195,6 +199,98 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, } } +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id) +{ + struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id]; + + if (!lowpan_iphc_ctx_is_active(ret)) + return NULL; + + return ret; +} + +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_addr(const struct net_device *dev, + const struct in6_addr *addr) +{ + struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *ret = NULL; + struct in6_addr addr_pfx; + u8 addr_plen; + int i; + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + /* Check if context is valid. A context that is not valid + * MUST NOT be used for compression. + */ + if (!lowpan_iphc_ctx_is_active(&table[i]) || + !lowpan_iphc_ctx_is_compression(&table[i])) + continue; + + ipv6_addr_prefix(&addr_pfx, addr, table[i].plen); + + /* if prefix len < 64, the remaining bits until 64th bit is + * zero. Otherwise we use table[i]->plen. + */ + if (table[i].plen < 64) + addr_plen = 64; + else + addr_plen = table[i].plen; + + if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) { + /* remember first match */ + if (!ret) { + ret = &table[i]; + continue; + } + + /* get the context with longest prefix len */ + if (table[i].plen > ret->plen) + ret = &table[i]; + } + } + + return ret; +} + +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, + const struct in6_addr *addr) +{ + struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *ret = NULL; + struct in6_addr addr_mcast, network_pfx = {}; + int i; + + /* init mcast address with */ + memcpy(&addr_mcast, addr, sizeof(*addr)); + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + /* Check if context is valid. A context that is not valid + * MUST NOT be used for compression. + */ + if (!lowpan_iphc_ctx_is_active(&table[i]) || + !lowpan_iphc_ctx_is_compression(&table[i])) + continue; + + /* setting plen */ + addr_mcast.s6_addr[3] = table[i].plen; + /* get network prefix to copy into multicast address */ + ipv6_addr_prefix(&network_pfx, &table[i].pfx, + table[i].plen); + /* setting network prefix */ + memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8); + + if (ipv6_addr_equal(addr, &addr_mcast)) { + ret = &table[i]; + break; + } + } + + return ret; +} + /* Uncompress address function for source and * destination address(non-multicast). * @@ -259,30 +355,59 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, /* Uncompress address function for source context * based address(non-multicast). */ -static int uncompress_context_based_src_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - u8 address_mode) +static int uncompress_ctx_addr(struct sk_buff *skb, + const struct net_device *dev, + const struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, u8 address_mode, + const void *lladdr) { + bool fail; + switch (address_mode) { - case LOWPAN_IPHC_SAM_00: - /* unspec address :: + /* SAM and DAM are the same here */ + case LOWPAN_IPHC_DAM_00: + fail = false; + /* SAM_00 -> unspec address :: * Do nothing, address is already :: + * + * DAM 00 -> reserved should never occur. */ break; case LOWPAN_IPHC_SAM_01: - /* TODO */ + case LOWPAN_IPHC_DAM_01: + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; case LOWPAN_IPHC_SAM_10: - /* TODO */ + case LOWPAN_IPHC_DAM_10: + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; case LOWPAN_IPHC_SAM_11: - /* TODO */ - netdev_warn(skb->dev, "SAM value 0x%x not supported\n", - address_mode); - return -EINVAL; + case LOWPAN_IPHC_DAM_11: + fail = false; + switch (lowpan_priv(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + iphc_uncompress_802154_lladdr(ipaddr, lladdr); + break; + default: + iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + break; + } + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; default: pr_debug("Invalid sam value: 0x%x\n", address_mode); return -EINVAL; } + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; + } + raw_dump_inline(NULL, "Reconstructed context based ipv6 src addr is", ipaddr->s6_addr, 16); @@ -346,6 +471,30 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, return 0; } +static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb, + struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, + u8 address_mode) +{ + struct in6_addr network_pfx = {}; + bool fail; + + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4); + if (fail) + return -EIO; + + /* take prefix_len and network prefix from the context */ + ipaddr->s6_addr[3] = ctx->plen; + /* get network prefix to copy into multicast address */ + ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen); + /* setting network prefix */ + memcpy(&ipaddr->s6_addr[4], &network_pfx, 8); + + return 0; +} + /* get the ecn values from iphc tf format and set it to ipv6hdr */ static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf) { @@ -459,7 +608,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, const void *daddr, const void *saddr) { struct ipv6hdr hdr = {}; - u8 iphc0, iphc1; + struct lowpan_iphc_ctx *ci; + u8 iphc0, iphc1, cid = 0; int err; raw_dump_table(__func__, "raw skb data dump uncompressed", @@ -469,12 +619,14 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1))) return -EINVAL; - /* another if the CID flag is set */ - if (iphc1 & LOWPAN_IPHC_CID) - return -ENOTSUPP; - hdr.version = 6; + /* default CID = 0, another if the CID flag is set */ + if (iphc1 & LOWPAN_IPHC_CID) { + if (lowpan_fetch_skb(skb, &cid, sizeof(cid))) + return -EINVAL; + } + err = lowpan_iphc_tf_decompress(skb, &hdr, iphc0 & LOWPAN_IPHC_TF_MASK); if (err < 0) @@ -500,10 +652,17 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, } if (iphc1 & LOWPAN_IPHC_SAC) { - /* Source address context based uncompression */ + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; + } + pr_debug("SAC bit is set. Handle context based source address.\n"); - err = uncompress_context_based_src_addr(skb, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK); + err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); @@ -515,27 +674,52 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, if (err) return -EINVAL; - /* check for Multicast Compression */ - if (iphc1 & LOWPAN_IPHC_M) { - if (iphc1 & LOWPAN_IPHC_DAC) { - pr_debug("dest: context-based mcast compression\n"); - /* TODO: implement this */ - } else { - err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK); + switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) { + case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC: + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; + } - if (err) - return -EINVAL; + /* multicast with context */ + pr_debug("dest: context-based mcast compression\n"); + err = lowpan_uncompress_multicast_ctx_daddr(skb, ci, + &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + break; + case LOWPAN_IPHC_M: + /* multicast */ + err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK); + break; + case LOWPAN_IPHC_DAC: + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; } - } else { + + /* Destination address context based uncompression */ + pr_debug("DAC bit is set. Handle context based destination address.\n"); + err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + break; + default: err = uncompress_addr(skb, dev, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); pr_debug("dest: stateless compression mode %d dest %pI6c\n", iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr); - if (err) - return -EINVAL; + break; } + if (err) + return -EINVAL; + /* Next header data uncompression */ if (iphc0 & LOWPAN_IPHC_NH) { err = lowpan_nhc_do_uncompression(skb, dev, &hdr); @@ -585,6 +769,58 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = { [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, }; +static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr, + const struct lowpan_iphc_ctx *ctx, + const unsigned char *lladdr, bool sam) +{ + struct in6_addr tmp = {}; + u8 dam; + + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], lladdr, 8); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + + memset(&tmp, 0, sizeof(tmp)); + /* check for SAM/DAM = 01 */ + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2); + dam = LOWPAN_IPHC_DAM_10; + goto out; + } + + memset(&tmp, 0, sizeof(tmp)); + /* check for SAM/DAM = 10, should always match */ + memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8); + dam = LOWPAN_IPHC_DAM_01; + goto out; + } + + WARN_ONCE(1, "context found but no address mode matched\n"); + return LOWPAN_IPHC_DAM_00; +out: + + if (sam) + return lowpan_iphc_dam_to_sam_value[dam]; + else + return dam; +} + static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, const unsigned char *lladdr, bool sam) { @@ -708,6 +944,21 @@ static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr) return val; } +static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr, + const struct lowpan_iphc_ctx *ctx, + const struct in6_addr *ipaddr) +{ + u8 data[6]; + + /* flags/scope, reserved (RIID) */ + memcpy(data, &ipaddr->s6_addr[1], 2); + /* group ID */ + memcpy(&data[1], &ipaddr->s6_addr[11], 4); + lowpan_push_hc_data(hc_ptr, data, 6); + + return LOWPAN_IPHC_DAM_00; +} + static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr, const struct in6_addr *ipaddr) { @@ -742,10 +993,11 @@ static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr, int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, const void *daddr, const void *saddr) { - u8 iphc0, iphc1, *hc_ptr; + u8 iphc0, iphc1, *hc_ptr, cid = 0; struct ipv6hdr *hdr; u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {}; - int ret, addr_type; + struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry; + int ret, ipv6_daddr_type, ipv6_saddr_type; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; @@ -769,14 +1021,38 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc0 = LOWPAN_DISPATCH_IPHC; iphc1 = 0; - /* TODO: context lookup */ - raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); + ipv6_daddr_type = ipv6_addr_type(&hdr->daddr); + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) + dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr); + else + dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr); + if (dci) { + memcpy(&dci_entry, dci, sizeof(*dci)); + cid |= dci->id; + } + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr); + if (sci) { + memcpy(&sci_entry, sci, sizeof(*sci)); + cid |= (sci->id << 4); + } + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + + /* if cid is zero it will be compressed */ + if (cid) { + iphc1 |= LOWPAN_IPHC_CID; + lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid)); + } + /* Traffic Class, Flow Label compression */ iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr); @@ -813,39 +1089,64 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, sizeof(hdr->hop_limit)); } - addr_type = ipv6_addr_type(&hdr->saddr); + ipv6_saddr_type = ipv6_addr_type(&hdr->saddr); /* source address compression */ - if (addr_type == IPV6_ADDR_ANY) { + if (ipv6_saddr_type == IPV6_ADDR_ANY) { pr_debug("source address is unspecified, setting SAC\n"); iphc1 |= LOWPAN_IPHC_SAC; } else { - if (addr_type & IPV6_ADDR_LINKLOCAL) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr, - saddr, true); - pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", - &hdr->saddr, iphc1); + if (sci) { + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr, + &sci_entry, saddr, + true); + iphc1 |= LOWPAN_IPHC_SAC; } else { - pr_debug("send the full source address\n"); - lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16); + if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) { + iphc1 |= lowpan_compress_addr_64(&hc_ptr, + &hdr->saddr, + saddr, true); + pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", + &hdr->saddr, iphc1); + } else { + pr_debug("send the full source address\n"); + lowpan_push_hc_data(&hc_ptr, + hdr->saddr.s6_addr, 16); + } } } - addr_type = ipv6_addr_type(&hdr->daddr); /* destination address compression */ - if (addr_type & IPV6_ADDR_MULTICAST) { + if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) { pr_debug("destination address is multicast: "); iphc1 |= LOWPAN_IPHC_M; - iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr); + if (dci) { + iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr, + &dci_entry, + &hdr->daddr); + iphc1 |= LOWPAN_IPHC_DAC; + } else { + iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, + &hdr->daddr); + } } else { - if (addr_type & IPV6_ADDR_LINKLOCAL) { - /* TODO: context lookup */ - iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr, - daddr, false); - pr_debug("dest address unicast link-local %pI6c " - "iphc1 0x%02x\n", &hdr->daddr, iphc1); + if (dci) { + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr, + &dci_entry, daddr, + false); + iphc1 |= LOWPAN_IPHC_DAC; } else { - pr_debug("dest address unicast %pI6c\n", &hdr->daddr); - lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16); + if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) { + iphc1 |= lowpan_compress_addr_64(&hc_ptr, + &hdr->daddr, + daddr, false); + pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n", + &hdr->daddr, iphc1); + } else { + pr_debug("dest address unicast %pI6c\n", + &hdr->daddr); + lowpan_push_hc_data(&hc_ptr, + hdr->daddr.s6_addr, 16); + } } } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d2cd9de4b724..69929c05c843 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; - new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT); vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad5e2fd1012c..e416a4038a12 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -621,12 +621,12 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, return features; } -static int vlan_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) +static int vlan_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - return __ethtool_get_settings(vlan->real_dev, cmd); + return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, @@ -741,7 +741,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev) } static const struct ethtool_ops vlan_ethtool_ops = { - .get_settings = vlan_ethtool_get_settings, + .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, @@ -799,6 +799,7 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; + dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index ae63cf72a953..5f1446c9f098 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev) /* * Delete directory entry for VLAN device. */ -int vlan_proc_rem_dev(struct net_device *vlandev) +void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; - return 0; } /****** Proc filesystem entry points ****************************************/ diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h index 063f60a3d5cc..8838a2e92eb6 100644 --- a/net/8021q/vlanproc.h +++ b/net/8021q/vlanproc.h @@ -5,7 +5,7 @@ struct net; int vlan_proc_init(struct net *net); -int vlan_proc_rem_dev(struct net_device *vlandev); +void vlan_proc_rem_dev(struct net_device *vlandev); int vlan_proc_add_dev(struct net_device *vlandev); void vlan_proc_cleanup(struct net *net); @@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net); #define vlan_proc_init(net) (0) #define vlan_proc_cleanup(net) do {} while (0) #define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) -#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) do {} while (0) #endif #endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/Kconfig b/net/Kconfig index 174354618f8a..10640d5f8bee 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -360,6 +360,7 @@ source "net/can/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" +source "net/kcm/Kconfig" config FIB_RULES bool @@ -392,6 +393,26 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config DST_CACHE + bool "dst cache" + default n + +config NET_DEVLINK + tristate "Network physical/parent device Netlink interface" + help + Network physical/parent device Netlink interface provides + infrastructure to support access to physical chip-wide config and + monitoring. + +config MAY_USE_DEVLINK + tristate + default m if NET_DEVLINK=m + default y if NET_DEVLINK=y || NET_DEVLINK=n + help + Drivers using the devlink infrastructure should have a dependency + on MAY_USE_DEVLINK to ensure they do not cause link errors when + devlink is a loadable module and the driver using it is built-in. + endif # if NET # Used by archs to tell that they support BPF_JIT diff --git a/net/Makefile b/net/Makefile index a5d04098dfce..81d14119eab5 100644 --- a/net/Makefile +++ b/net/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ +obj-$(CONFIG_AF_KCM) += kcm/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index d5871ac493eb..f066781be3c8 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rt = atrtr_find(&at_hint); } - err = ENETUNREACH; + err = -ENETUNREACH; if (!rt) goto out; diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index b563a3f5f2a8..2fa3be965101 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -228,8 +228,23 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) } #endif +static bool ax25_validate_header(const char *header, unsigned int len) +{ + ax25_digi digi; + + if (!len) + return false; + + if (header[0]) + return true; + + return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL, + NULL); +} + const struct header_ops ax25_header_ops = { .create = ax25_hard_header, + .validate = ax25_validate_header, }; EXPORT_SYMBOL(ax25_header_ops); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 2dd40e5ea030..f66930ee3c0b 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -15,6 +15,20 @@ config BATMAN_ADV https://www.open-mesh.org/ for more information and user space tools. +config BATMAN_ADV_BATMAN_V + bool "B.A.T.M.A.N. V protocol (experimental)" + depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m) + default n + help + This option enables the B.A.T.M.A.N. V protocol, the successor + of the currently used B.A.T.M.A.N. IV protocol. The main + changes include splitting of the OGM protocol into a neighbor + discovery protocol (Echo Location Protocol, ELP) and a new OGM + Protocol OGMv2 for flooding protocol information through the + network, as well as a throughput based metric. + B.A.T.M.A.N. V is currently considered experimental and not + compatible to B.A.T.M.A.N. IV networks. + config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 207e2af316c7..797cf2fc88c1 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -18,6 +18,9 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_iv_ogm.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index a7485d676088..03dafd33d23b 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,6 +1,6 @@ /* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * - * Marek Lindner + * Marek Lindner, Linus Lüssing * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -18,6 +18,32 @@ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ #define _NET_BATMAN_ADV_BAT_ALGO_H_ +struct batadv_priv; + int batadv_iv_init(void); +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +int batadv_v_init(void); +int batadv_v_mesh_init(struct batadv_priv *bat_priv); +void batadv_v_mesh_free(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_v_init(void) +{ + return 0; +} + +static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ +} + +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index bf0e7d6f12bb..cb2d1b9b0340 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -287,8 +287,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) free_orig_node: /* free twice, as batadv_orig_node_new sets refcount to 2 */ - batadv_orig_node_free_ref(orig_node); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NULL; } @@ -478,7 +478,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); + batadv_send_broadcast_skb(skb, hard_iface); } } @@ -515,7 +515,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -617,7 +617,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return res; } @@ -711,9 +711,9 @@ out_nomem: if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); out_free_outgoing: - batadv_hardif_free_ref(if_outgoing); + batadv_hardif_put(if_outgoing); out_free_incoming: - batadv_hardif_free_ref(if_incoming); + batadv_hardif_put(if_incoming); } /* aggregate a new packet into the existing ogm packet */ @@ -958,7 +958,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1005,7 +1005,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, tmp_neigh_node->if_incoming == if_incoming && kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } @@ -1026,7 +1026,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } @@ -1041,7 +1041,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, ethhdr->h_source, orig_node, orig_tmp); - batadv_orig_node_free_ref(orig_tmp); + batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { @@ -1116,13 +1116,13 @@ unlock: rcu_read_unlock(); out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -1192,7 +1192,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } @@ -1265,7 +1265,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -1306,7 +1306,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 0; } @@ -1353,7 +1353,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); @@ -1367,8 +1367,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - batadv_orig_node_free_ref(orig_node); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_node_put(orig_node); + batadv_orig_ifinfo_put(orig_ifinfo); return ret; } @@ -1514,7 +1514,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, ogm_packet, if_incoming, if_outgoing, dup_status); } - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) @@ -1563,18 +1563,18 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, out_neigh: if ((orig_neigh_node) && (!is_single_hop_neigh)) - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_router) - batadv_neigh_node_free_ref(router_router); + batadv_neigh_node_put(router_router); if (orig_neigh_router) - batadv_neigh_node_free_ref(orig_neigh_router); + batadv_neigh_node_put(orig_neigh_router); if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); kfree_skb(skb_priv); } @@ -1697,7 +1697,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); return; } @@ -1735,7 +1735,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static int batadv_iv_ogm_receive(struct sk_buff *skb, @@ -1805,7 +1805,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, neigh_node->addr, n_ifinfo->bat_iv.tq_avg); - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } } @@ -1868,9 +1868,9 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, batman_count++; next: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (n_ifinfo) - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } rcu_read_unlock(); } @@ -1964,9 +1964,9 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return diff; } @@ -2007,9 +2007,9 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c new file mode 100644 index 000000000000..3315b9a598af --- /dev/null +++ b/net/batman-adv/bat_v.c @@ -0,0 +1,347 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_algo.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "bat_v_elp.h" +#include "bat_v_ogm.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" + +static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) +{ + int ret; + + ret = batadv_v_elp_iface_enable(hard_iface); + if (ret < 0) + return ret; + + ret = batadv_v_ogm_iface_enable(hard_iface); + if (ret < 0) + batadv_v_elp_iface_disable(hard_iface); + + /* enable link throughput auto-detection by setting the throughput + * override to zero + */ + atomic_set(&hard_iface->bat_v.throughput_override, 0); + + return ret; +} + +static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) +{ + batadv_v_elp_iface_disable(hard_iface); +} + +static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) +{ +} + +static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) +{ + batadv_v_elp_primary_iface_set(hard_iface); + batadv_v_ogm_primary_iface_set(hard_iface); +} + +static void +batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) +{ + ewma_throughput_init(&hardif_neigh->bat_v.throughput); + INIT_WORK(&hardif_neigh->bat_v.metric_work, + batadv_v_elp_throughput_metric_update); +} + +static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface) +{ +} + +static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet) +{ +} + +/** + * batadv_v_orig_print_neigh - print neighbors for the originator table + * @orig_node: the orig_node for which the neighbors are printed + * @if_outgoing: outgoing interface for these entries + * @seq: debugfs table seq_file struct + * + * Must be called while holding an rcu lock. + */ +static void +batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing, + struct seq_file *seq) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_neigh_ifinfo *n_ifinfo; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + continue; + + seq_printf(seq, " %pM (%9u.%1u)", + neigh_node->addr, + n_ifinfo->bat_v.throughput / 10, + n_ifinfo->bat_v.throughput % 10); + + batadv_neigh_ifinfo_put(n_ifinfo); + } +} + +/** + * batadv_v_hardif_neigh_print - print a single ELP neighbour node + * @seq: neighbour table seq_file struct + * @hardif_neigh: hardif neighbour information + */ +static void +batadv_v_hardif_neigh_print(struct seq_file *seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + int last_secs, last_msecs; + u32 throughput; + + last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000; + last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000; + throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + + seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n", + hardif_neigh->addr, last_secs, last_msecs, throughput / 10, + throughput % 10, hardif_neigh->if_incoming->net_dev->name); +} + +/** + * batadv_v_neigh_print - print the single hop neighbour list + * @bat_priv: the bat priv with all the soft interface information + * @seq: neighbour table seq_file struct + */ +static void batadv_v_neigh_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + int batman_count = 0; + + seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor", + "last-seen", "throughput", "IF"); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != net_dev) + continue; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + batadv_v_hardif_neigh_print(seq, hardif_neigh); + batman_count++; + } + } + rcu_read_unlock(); + + if (batman_count == 0) + seq_puts(seq, "No batman nodes in range ...\n"); +} + +/** + * batadv_v_orig_print - print the originator table + * @bat_priv: the bat priv with all the soft interface information + * @seq: debugfs table seq_file struct + * @if_outgoing: the outgoing interface for which this should be printed + */ +static void batadv_v_orig_print(struct batadv_priv *bat_priv, + struct seq_file *seq, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_hashtable *hash = bat_priv->orig_hash; + int last_seen_msecs, last_seen_secs; + struct batadv_orig_node *orig_node; + struct batadv_neigh_ifinfo *n_ifinfo; + unsigned long last_seen_jiffies; + struct hlist_head *head; + int batman_count = 0; + u32 i; + + seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n", + "Originator", "last-seen", "throughput", "Nexthop", + "outgoingIF", "Potential nexthops"); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + neigh_node = batadv_orig_router_get(orig_node, + if_outgoing); + if (!neigh_node) + continue; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, + if_outgoing); + if (!n_ifinfo) + goto next; + + last_seen_jiffies = jiffies - orig_node->last_seen; + last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); + last_seen_secs = last_seen_msecs / 1000; + last_seen_msecs = last_seen_msecs % 1000; + + seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:", + orig_node->orig, last_seen_secs, + last_seen_msecs, + n_ifinfo->bat_v.throughput / 10, + n_ifinfo->bat_v.throughput % 10, + neigh_node->addr, + neigh_node->if_incoming->net_dev->name); + + batadv_v_orig_print_neigh(orig_node, if_outgoing, seq); + seq_puts(seq, "\n"); + batman_count++; + +next: + batadv_neigh_node_put(neigh_node); + if (n_ifinfo) + batadv_neigh_ifinfo_put(n_ifinfo); + } + rcu_read_unlock(); + } + + if (batman_count == 0) + seq_puts(seq, "No batman nodes in range ...\n"); +} + +static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; + + ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + + if (WARN_ON(!ifinfo1 || !ifinfo2)) + return 0; + + return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; +} + +static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; + u32 threshold; + + ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + + threshold = ifinfo1->bat_v.throughput / 4; + threshold = ifinfo1->bat_v.throughput - threshold; + + return ifinfo2->bat_v.throughput > threshold; +} + +static struct batadv_algo_ops batadv_batman_v __read_mostly = { + .name = "BATMAN_V", + .bat_iface_enable = batadv_v_iface_enable, + .bat_iface_disable = batadv_v_iface_disable, + .bat_iface_update_mac = batadv_v_iface_update_mac, + .bat_primary_iface_set = batadv_v_primary_iface_set, + .bat_hardif_neigh_init = batadv_v_hardif_neigh_init, + .bat_ogm_emit = batadv_v_ogm_emit, + .bat_ogm_schedule = batadv_v_ogm_schedule, + .bat_orig_print = batadv_v_orig_print, + .bat_neigh_cmp = batadv_v_neigh_cmp, + .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob, + .bat_neigh_print = batadv_v_neigh_print, +}; + +/** + * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a + * mesh + * @bat_priv: the object representing the mesh interface to initialise + * + * Return: 0 on success or a negative error code otherwise + */ +int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return batadv_v_ogm_init(bat_priv); +} + +/** + * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh + * @bat_priv: the object representing the mesh interface to free + */ +void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ + batadv_v_ogm_free(bat_priv); +} + +/** + * batadv_v_init - B.A.T.M.A.N. V initialization function + * + * Description: Takes care of initializing all the subcomponents. + * It is invoked upon module load only. + * + * Return: 0 on success or a negative error code otherwise + */ +int __init batadv_v_init(void) +{ + int ret; + + /* B.A.T.M.A.N. V echo location protocol packet */ + ret = batadv_recv_handler_register(BATADV_ELP, + batadv_v_elp_packet_recv); + if (ret < 0) + return ret; + + ret = batadv_recv_handler_register(BATADV_OGM2, + batadv_v_ogm_packet_recv); + if (ret < 0) + goto elp_unregister; + + ret = batadv_algo_register(&batadv_batman_v); + if (ret < 0) + goto ogm_unregister; + + return ret; + +ogm_unregister: + batadv_recv_handler_unregister(BATADV_OGM2); + +elp_unregister: + batadv_recv_handler_unregister(BATADV_ELP); + + return ret; +} diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c new file mode 100644 index 000000000000..3844e7efd0b0 --- /dev/null +++ b/net/batman-adv/bat_v_elp.c @@ -0,0 +1,515 @@ +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_v_elp.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kref.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/cfg80211.h> + +#include "bat_algo.h" +#include "bat_v_ogm.h" +#include "hard-interface.h" +#include "originator.h" +#include "packet.h" +#include "routing.h" +#include "send.h" + +/** + * batadv_v_elp_start_timer - restart timer for ELP periodic work + * @hard_iface: the interface for which the timer has to be reset + */ +static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) +{ + unsigned int msecs; + + msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; + msecs += prandom_u32() % (2 * BATADV_JITTER); + + queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, + msecs_to_jiffies(msecs)); +} + +/** + * batadv_v_elp_get_throughput - get the throughput towards a neighbour + * @neigh: the neighbour for which the throughput has to be obtained + * + * Return: The throughput towards the given neighbour in multiples of 100kpbs + * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + */ +static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +{ + struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct ethtool_link_ksettings link_settings; + struct station_info sinfo; + u32 throughput; + int ret; + + /* if the user specified a customised value for this interface, then + * return it directly + */ + throughput = atomic_read(&hard_iface->bat_v.throughput_override); + if (throughput != 0) + return throughput; + + /* if this is a wireless device, then ask its throughput through + * cfg80211 API + */ + if (batadv_is_wifi_netdev(hard_iface->net_dev)) { + if (hard_iface->net_dev->ieee80211_ptr) { + ret = cfg80211_get_station(hard_iface->net_dev, + neigh->addr, &sinfo); + if (ret == -ENOENT) { + /* Node is not associated anymore! It would be + * possible to delete this neighbor. For now set + * the throughput metric to 0. + */ + return 0; + } + if (!ret) + return sinfo.expected_throughput / 100; + } + + /* unsupported WiFi driver version */ + goto default_throughput; + } + + /* if not a wifi interface, check if this device provides data via + * ethtool (e.g. an Ethernet adapter) + */ + memset(&link_settings, 0, sizeof(link_settings)); + rtnl_lock(); + ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); + rtnl_unlock(); + if (ret == 0) { + /* link characteristics might change over time */ + if (link_settings.base.duplex == DUPLEX_FULL) + hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; + else + hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; + + throughput = link_settings.base.speed; + if (throughput && (throughput != SPEED_UNKNOWN)) + return throughput * 10; + } + +default_throughput: + if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { + batadv_info(hard_iface->soft_iface, + "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", + hard_iface->net_dev->name, + BATADV_THROUGHPUT_DEFAULT_VALUE / 10, + BATADV_THROUGHPUT_DEFAULT_VALUE % 10); + hard_iface->bat_v.flags |= BATADV_WARNING_DEFAULT; + } + + /* if none of the above cases apply, return the base_throughput */ + return BATADV_THROUGHPUT_DEFAULT_VALUE; +} + +/** + * batadv_v_elp_throughput_metric_update - worker updating the throughput metric + * of a single hop neighbour + * @work: the work queue item + */ +void batadv_v_elp_throughput_metric_update(struct work_struct *work) +{ + struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; + struct batadv_hardif_neigh_node *neigh; + + neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, + metric_work); + neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, + bat_v); + + ewma_throughput_add(&neigh->bat_v.throughput, + batadv_v_elp_get_throughput(neigh)); + + /* decrement refcounter to balance increment performed before scheduling + * this task + */ + batadv_hardif_neigh_put(neigh); +} + +/** + * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour + * @neigh: the neighbour to probe + * + * Sends a predefined number of unicast wifi packets to a given neighbour in + * order to trigger the throughput estimation on this link by the RC algorithm. + * Packets are sent only if there there is not enough payload unicast traffic + * towards this neighbour.. + * + * Return: True on success and false in case of error during skb preparation. + */ +static bool +batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) +{ + struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + unsigned long last_tx_diff; + struct sk_buff *skb; + int probe_len, i; + int elp_skb_len; + + /* this probing routine is for Wifi neighbours only */ + if (!batadv_is_wifi_netdev(hard_iface->net_dev)) + return true; + + /* probe the neighbor only if no unicast packets have been sent + * to it in the last 100 milliseconds: this is the rate control + * algorithm sampling interval (minstrel). In this way, if not + * enough traffic has been sent to the neighbor, batman-adv can + * generate 2 probe packets and push the RC algorithm to perform + * the sampling + */ + last_tx_diff = jiffies_to_msecs(jiffies - neigh->bat_v.last_unicast_tx); + if (last_tx_diff <= BATADV_ELP_PROBE_MAX_TX_DIFF) + return true; + + probe_len = max_t(int, sizeof(struct batadv_elp_packet), + BATADV_ELP_MIN_PROBE_SIZE); + + for (i = 0; i < BATADV_ELP_PROBES_PER_NODE; i++) { + elp_skb_len = hard_iface->bat_v.elp_skb->len; + skb = skb_copy_expand(hard_iface->bat_v.elp_skb, 0, + probe_len - elp_skb_len, + GFP_ATOMIC); + if (!skb) + return false; + + /* Tell the skb to get as big as the allocated space (we want + * the packet to be exactly of that size to make the link + * throughput estimation effective. + */ + skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending unicast (probe) ELP packet on interface %s to %pM\n", + hard_iface->net_dev->name, neigh->addr); + + batadv_send_skb_packet(skb, hard_iface, neigh->addr); + } + + return true; +} + +/** + * batadv_v_elp_periodic_work - ELP periodic task per interface + * @work: work queue item + * + * Emits broadcast ELP message in regular intervals. + */ +static void batadv_v_elp_periodic_work(struct work_struct *work) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface_bat_v *bat_v; + struct batadv_elp_packet *elp_packet; + struct batadv_priv *bat_priv; + struct sk_buff *skb; + u32 elp_interval; + + bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); + hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); + bat_priv = netdev_priv(hard_iface->soft_iface); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + /* we are in the process of shutting this interface down */ + if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || + (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + goto out; + + /* the interface was enabled but may not be ready yet */ + if (hard_iface->if_status != BATADV_IF_ACTIVE) + goto restart_timer; + + skb = skb_copy(hard_iface->bat_v.elp_skb, GFP_ATOMIC); + if (!skb) + goto restart_timer; + + elp_packet = (struct batadv_elp_packet *)skb->data; + elp_packet->seqno = htonl(atomic_read(&hard_iface->bat_v.elp_seqno)); + elp_interval = atomic_read(&hard_iface->bat_v.elp_interval); + elp_packet->elp_interval = htonl(elp_interval); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending broadcast ELP packet on interface %s, seqno %u\n", + hard_iface->net_dev->name, + atomic_read(&hard_iface->bat_v.elp_seqno)); + + batadv_send_broadcast_skb(skb, hard_iface); + + atomic_inc(&hard_iface->bat_v.elp_seqno); + + /* The throughput metric is updated on each sent packet. This way, if a + * node is dead and no longer sends packets, batman-adv is still able to + * react timely to its death. + * + * The throughput metric is updated by following these steps: + * 1) if the hard_iface is wifi => send a number of unicast ELPs for + * probing/sampling to each neighbor + * 2) update the throughput metric value of each neighbor (note that the + * value retrieved in this step might be 100ms old because the + * probing packets at point 1) could still be in the HW queue) + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { + if (!batadv_v_elp_wifi_neigh_probe(hardif_neigh)) + /* if something goes wrong while probing, better to stop + * sending packets immediately and reschedule the task + */ + break; + + if (!kref_get_unless_zero(&hardif_neigh->refcount)) + continue; + + /* Reading the estimated throughput from cfg80211 is a task that + * may sleep and that is not allowed in an rcu protected + * context. Therefore schedule a task for that. + */ + queue_work(batadv_event_workqueue, + &hardif_neigh->bat_v.metric_work); + } + rcu_read_unlock(); + +restart_timer: + batadv_v_elp_start_timer(hard_iface); +out: + return; +} + +/** + * batadv_v_elp_iface_enable - setup the ELP interface private resources + * @hard_iface: interface for which the data has to be prepared + * + * Return: 0 on success or a -ENOMEM in case of failure. + */ +int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) +{ + struct batadv_elp_packet *elp_packet; + unsigned char *elp_buff; + u32 random_seqno; + size_t size; + int res = -ENOMEM; + + size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN; + hard_iface->bat_v.elp_skb = dev_alloc_skb(size); + if (!hard_iface->bat_v.elp_skb) + goto out; + + skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); + elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_packet = (struct batadv_elp_packet *)elp_buff; + memset(elp_packet, 0, BATADV_ELP_HLEN); + + elp_packet->packet_type = BATADV_ELP; + elp_packet->version = BATADV_COMPAT_VERSION; + + /* randomize initial seqno to avoid collision */ + get_random_bytes(&random_seqno, sizeof(random_seqno)); + atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno); + atomic_set(&hard_iface->bat_v.elp_interval, 500); + + /* assume full-duplex by default */ + hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; + + /* warn the user (again) if there is no throughput data is available */ + hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT; + + if (batadv_is_wifi_netdev(hard_iface->net_dev)) + hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; + + INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq, + batadv_v_elp_periodic_work); + batadv_v_elp_start_timer(hard_iface); + res = 0; + +out: + return res; +} + +/** + * batadv_v_elp_iface_disable - release ELP interface private resources + * @hard_iface: interface for which the resources have to be released + */ +void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) +{ + cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq); + + dev_kfree_skb(hard_iface->bat_v.elp_skb); + hard_iface->bat_v.elp_skb = NULL; +} + +/** + * batadv_v_elp_primary_iface_set - change internal data to reflect the new + * primary interface + * @primary_iface: the new primary interface + */ +void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_elp_packet *elp_packet; + struct sk_buff *skb; + + /* update orig field of every elp iface belonging to this mesh */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (primary_iface->soft_iface != hard_iface->soft_iface) + continue; + + if (!hard_iface->bat_v.elp_skb) + continue; + + skb = hard_iface->bat_v.elp_skb; + elp_packet = (struct batadv_elp_packet *)skb->data; + ether_addr_copy(elp_packet->orig, + primary_iface->net_dev->dev_addr); + } + rcu_read_unlock(); +} + +/** + * batadv_v_elp_neigh_update - update an ELP neighbour node + * @bat_priv: the bat priv with all the soft interface information + * @neigh_addr: the neighbour interface address + * @if_incoming: the interface the packet was received through + * @elp_packet: the received ELP packet + * + * Updates the ELP neighbour node state with the data received within the new + * ELP packet. + */ +static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, + u8 *neigh_addr, + struct batadv_hard_iface *if_incoming, + struct batadv_elp_packet *elp_packet) + +{ + struct batadv_neigh_node *neigh; + struct batadv_orig_node *orig_neigh; + struct batadv_hardif_neigh_node *hardif_neigh; + s32 seqno_diff; + s32 elp_latest_seqno; + + orig_neigh = batadv_v_ogm_orig_get(bat_priv, elp_packet->orig); + if (!orig_neigh) + return; + + neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr); + if (!neigh) + goto orig_free; + + hardif_neigh = batadv_hardif_neigh_get(if_incoming, neigh_addr); + if (!hardif_neigh) + goto neigh_free; + + elp_latest_seqno = hardif_neigh->bat_v.elp_latest_seqno; + seqno_diff = ntohl(elp_packet->seqno) - elp_latest_seqno; + + /* known or older sequence numbers are ignored. However always adopt + * if the router seems to have been restarted. + */ + if (seqno_diff < 1 && seqno_diff > -BATADV_ELP_MAX_AGE) + goto hardif_free; + + neigh->last_seen = jiffies; + hardif_neigh->last_seen = jiffies; + hardif_neigh->bat_v.elp_latest_seqno = ntohl(elp_packet->seqno); + hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval); + +hardif_free: + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +neigh_free: + if (neigh) + batadv_neigh_node_put(neigh); +orig_free: + if (orig_neigh) + batadv_orig_node_put(orig_neigh); +} + +/** + * batadv_v_elp_packet_recv - main ELP packet handler + * @skb: the received packet + * @if_incoming: the interface this packet was received through + * + * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * processed or NET_RX_DROP in case of failure. + */ +int batadv_v_elp_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_elp_packet *elp_packet; + struct batadv_hard_iface *primary_if; + struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + bool ret; + + ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN); + if (!ret) + return NET_RX_DROP; + + if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) + return NET_RX_DROP; + + /* did we receive a B.A.T.M.A.N. V ELP packet on an interface + * that does not have B.A.T.M.A.N. V ELP enabled ? + */ + if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + return NET_RX_DROP; + + elp_packet = (struct batadv_elp_packet *)skb->data; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Received ELP packet from %pM seqno %u ORIG: %pM\n", + ethhdr->h_source, ntohl(elp_packet->seqno), + elp_packet->orig); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming, + elp_packet); + +out: + if (primary_if) + batadv_hardif_put(primary_if); + consume_skb(skb); + return NET_RX_SUCCESS; +} diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h new file mode 100644 index 000000000000..e95f1bca0785 --- /dev/null +++ b/net/batman-adv/bat_v_elp.h @@ -0,0 +1,33 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "main.h" + +#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_ +#define _NET_BATMAN_ADV_BAT_V_ELP_H_ + +struct sk_buff; +struct work_struct; + +int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); +void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); +void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); +int batadv_v_elp_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming); +void batadv_v_elp_throughput_metric_update(struct work_struct *work); + +#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c new file mode 100644 index 000000000000..d9bcbe6e7d65 --- /dev/null +++ b/net/batman-adv/bat_v_ogm.c @@ -0,0 +1,833 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_v_ogm.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" +#include "routing.h" +#include "send.h" +#include "translation-table.h" + +/** + * batadv_v_ogm_orig_get - retrieve and possibly create an originator node + * @bat_priv: the bat priv with all the soft interface information + * @addr: the address of the originator + * + * Return: the orig_node corresponding to the specified address. If such object + * does not exist it is allocated here. In case of allocation failure returns + * NULL. + */ +struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, + const u8 *addr) +{ + struct batadv_orig_node *orig_node; + int hash_added; + + orig_node = batadv_orig_hash_find(bat_priv, addr); + if (orig_node) + return orig_node; + + orig_node = batadv_orig_node_new(bat_priv, addr); + if (!orig_node) + return NULL; + + hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, + batadv_choose_orig, orig_node, + &orig_node->hash_entry); + if (hash_added != 0) { + /* orig_node->refcounter is initialised to 2 by + * batadv_orig_node_new() + */ + batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); + orig_node = NULL; + } + + return orig_node; +} + +/** + * batadv_v_ogm_start_timer - restart the OGM sending timer + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) +{ + unsigned long msecs; + /* this function may be invoked in different contexts (ogm rescheduling + * or hard_iface activation), but the work timer should not be reset + */ + if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) + return; + + msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; + msecs += prandom_u32() % (2 * BATADV_JITTER); + queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, + msecs_to_jiffies(msecs)); +} + +/** + * batadv_v_ogm_send_to_if - send a batman ogm using a given interface + * @skb: the OGM to send + * @hard_iface: the interface to use to send the OGM + */ +static void batadv_v_ogm_send_to_if(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + + if (hard_iface->if_status != BATADV_IF_ACTIVE) + return; + + batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); + batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, + skb->len + ETH_HLEN); + + batadv_send_broadcast_skb(skb, hard_iface); +} + +/** + * batadv_v_ogm_send - periodic worker broadcasting the own OGM + * @work: work queue item + */ +static void batadv_v_ogm_send(struct work_struct *work) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_priv_bat_v *bat_v; + struct batadv_priv *bat_priv; + struct batadv_ogm2_packet *ogm_packet; + struct sk_buff *skb, *skb_tmp; + unsigned char *ogm_buff, *pkt_buff; + int ogm_buff_len; + u16 tvlv_len = 0; + + bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); + bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + ogm_buff = bat_priv->bat_v.ogm_buff; + ogm_buff_len = bat_priv->bat_v.ogm_buff_len; + /* tt changes have to be committed before the tvlv data is + * appended as it may alter the tt tvlv container + */ + batadv_tt_local_commit_changes(bat_priv); + tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, + &ogm_buff_len, + BATADV_OGM2_HLEN); + + bat_priv->bat_v.ogm_buff = ogm_buff; + bat_priv->bat_v.ogm_buff_len = ogm_buff_len; + + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); + if (!skb) + goto reschedule; + + skb_reserve(skb, ETH_HLEN); + pkt_buff = skb_put(skb, ogm_buff_len); + memcpy(pkt_buff, ogm_buff, ogm_buff_len); + + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); + atomic_inc(&bat_priv->bat_v.ogm_seqno); + ogm_packet->tvlv_len = htons(tvlv_len); + + /* broadcast on every interface */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", + ogm_packet->orig, ntohl(ogm_packet->seqno), + ntohl(ogm_packet->throughput), ogm_packet->ttl, + hard_iface->net_dev->name, + hard_iface->net_dev->dev_addr); + + /* this skb gets consumed by batadv_v_ogm_send_to_if() */ + skb_tmp = skb_clone(skb, GFP_ATOMIC); + if (!skb_tmp) + break; + + batadv_v_ogm_send_to_if(skb_tmp, hard_iface); + } + rcu_read_unlock(); + + consume_skb(skb); + +reschedule: + batadv_v_ogm_start_timer(bat_priv); +out: + return; +} + +/** + * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V + * @hard_iface: the interface to prepare + * + * Takes care of scheduling own OGM sending routine for this interface. + * + * Return: 0 on success or a negative error code otherwise + */ +int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + + batadv_v_ogm_start_timer(bat_priv); + + return 0; +} + +/** + * batadv_v_ogm_primary_iface_set - set a new primary interface + * @primary_iface: the new primary interface + */ +void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); + struct batadv_ogm2_packet *ogm_packet; + + if (!bat_priv->bat_v.ogm_buff) + return; + + ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; + ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); +} + +/** + * batadv_v_ogm_orig_update - update the originator status based on the received + * OGM + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: the originator to update + * @neigh_node: the neighbour the OGM has been received from (to update) + * @ogm2: the received OGM + * @if_outgoing: the interface where this OGM is going to be forwarded through + */ +static void +batadv_v_ogm_orig_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + const struct batadv_ogm2_packet *ogm2, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; + struct batadv_neigh_node *router = NULL; + s32 neigh_seq_diff; + u32 neigh_last_seqno; + u32 router_last_seqno; + u32 router_throughput, neigh_throughput; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Searching and updating originator entry of received packet\n"); + + /* if this neighbor already is our next hop there is nothing + * to change + */ + router = batadv_orig_router_get(orig_node, if_outgoing); + if (router == neigh_node) + goto out; + + /* don't consider neighbours with worse throughput. + * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than + * the last received seqno from our best next hop. + */ + if (router) { + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + + /* if these are not allocated, something is wrong. */ + if (!router_ifinfo || !neigh_ifinfo) + goto out; + + neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; + router_last_seqno = router_ifinfo->bat_v.last_seqno; + neigh_seq_diff = neigh_last_seqno - router_last_seqno; + router_throughput = router_ifinfo->bat_v.throughput; + neigh_throughput = neigh_ifinfo->bat_v.throughput; + + if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && + (router_throughput >= neigh_throughput)) + goto out; + } + + batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); + if (router) + batadv_neigh_node_put(router); +} + +/** + * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded + * with B.A.T.M.A.N. V OGMs + * @bat_priv: the bat priv with all the soft interface information + * @if_incoming: the interface where the OGM has been received + * @if_outgoing: the interface where the OGM has to be forwarded to + * @throughput: the current throughput + * + * Apply a penalty on the current throughput metric value based on the + * characteristic of the interface where the OGM has been received. The return + * value is computed as follows: + * - throughput * 50% if the incoming and outgoing interface are the + * same WiFi interface and the throughput is above + * 1MBit/s + * - throughput if the outgoing interface is the default + * interface (i.e. this OGM is processed for the + * internal table and not forwarded) + * - throughput * hop penalty otherwise + * + * Return: the penalised throughput metric. + */ +static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + u32 throughput) +{ + int hop_penalty = atomic_read(&bat_priv->hop_penalty); + int hop_penalty_max = BATADV_TQ_MAX_VALUE; + + /* Don't apply hop penalty in default originator table. */ + if (if_outgoing == BATADV_IF_DEFAULT) + return throughput; + + /* Forwarding on the same WiFi interface cuts the throughput in half + * due to the store & forward characteristics of WIFI. + * Very low throughput values are the exception. + */ + if ((throughput > 10) && + (if_incoming == if_outgoing) && + !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) + return throughput / 2; + + /* hop penalty of 255 equals 100% */ + return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; +} + +/** + * batadv_v_ogm_forward - forward an OGM to the given outgoing interface + * @bat_priv: the bat priv with all the soft interface information + * @ogm_received: previously received OGM to be forwarded + * @throughput: throughput to announce, may vary per outgoing interface + * @if_incoming: the interface on which this OGM was received on + * @if_outgoing: the interface to which the OGM has to be forwarded to + * + * Forward an OGM to an interface after having altered the throughput metric and + * the TTL value contained in it. The original OGM isn't modified. + */ +static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, + const struct batadv_ogm2_packet *ogm_received, + u32 throughput, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_ogm2_packet *ogm_forward; + unsigned char *skb_buff; + struct sk_buff *skb; + size_t packet_len; + u16 tvlv_len; + + if (ogm_received->ttl <= 1) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); + return; + } + + tvlv_len = ntohs(ogm_received->tvlv_len); + + packet_len = BATADV_OGM2_HLEN + tvlv_len; + skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, + ETH_HLEN + packet_len); + if (!skb) + return; + + skb_reserve(skb, ETH_HLEN); + skb_buff = skb_put(skb, packet_len); + memcpy(skb_buff, ogm_received, packet_len); + + /* apply forward penalty */ + ogm_forward = (struct batadv_ogm2_packet *)skb_buff; + ogm_forward->throughput = htonl(throughput); + ogm_forward->ttl--; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", + if_outgoing->net_dev->name, throughput, ogm_forward->ttl, + if_incoming->net_dev->name); + + batadv_v_ogm_send_to_if(skb, if_outgoing); +} + +/** + * batadv_v_ogm_metric_update - update route metric based on OGM + * @bat_priv: the bat priv with all the soft interface information + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + * + * Return: + * 1 if the OGM is new, + * 0 if it is not new but valid, + * <0 on error (e.g. old OGM) + */ +static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + bool protection_started = false; + int ret = -EINVAL; + u32 path_throughput; + s32 seq_diff; + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; + + if (!hlist_empty(&orig_node->neigh_list) && + batadv_window_protected(bat_priv, seq_diff, + BATADV_OGM_MAX_AGE, + &orig_ifinfo->batman_seqno_reset, + &protection_started)) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: packet within window protection time from %pM\n", + ogm2->orig); + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Last reset: %ld, %ld\n", + orig_ifinfo->batman_seqno_reset, jiffies); + goto out; + } + + /* drop packets with old seqnos, however accept the first packet after + * a host has been rebooted. + */ + if ((seq_diff < 0) && !protection_started) + goto out; + + neigh_node->last_seen = jiffies; + + orig_node->last_seen = jiffies; + + orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); + orig_ifinfo->last_ttl = ogm2->ttl; + + neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + + path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, + if_outgoing, + ntohl(ogm2->throughput)); + neigh_ifinfo->bat_v.throughput = path_throughput; + neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); + neigh_ifinfo->last_ttl = ogm2->ttl; + + if (seq_diff > 0 || protection_started) + ret = 1; + else + ret = 0; +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); + + return ret; +} + +/** + * batadv_v_ogm_route_update - update routes based on OGM + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: the Ethernet header of the OGM2 + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + */ +static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, + const struct ethhdr *ethhdr, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *router = NULL; + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + struct batadv_orig_node *orig_neigh_node = NULL; + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_node *orig_neigh_router = NULL; + + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + + orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); + if (!orig_neigh_node) + goto out; + + orig_neigh_router = batadv_orig_router_get(orig_neigh_node, + if_outgoing); + + /* drop packet if sender is not a direct neighbor and if we + * don't route towards it + */ + router = batadv_orig_router_get(orig_node, if_outgoing); + if (router && router->orig_node != orig_node && !orig_neigh_router) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: OGM via unknown neighbor!\n"); + goto out; + } + + if (router) + batadv_neigh_node_put(router); + + /* Update routes, and check if the OGM is from the best next hop */ + batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2, + if_outgoing); + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + /* don't forward the same seqno twice on one interface */ + if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno)) + goto out; + + /* acquire possibly updated router */ + router = batadv_orig_router_get(orig_node, if_outgoing); + + /* strict rule: forward packets coming from the best next hop only */ + if (neigh_node != router) + goto out; + + /* only forward for specific interface, not for the default one. */ + if (if_outgoing != BATADV_IF_DEFAULT) { + orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno); + batadv_v_ogm_forward(bat_priv, ogm2, + neigh_ifinfo->bat_v.throughput, + if_incoming, if_outgoing); + } + +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (router) + batadv_neigh_node_put(router); + if (orig_neigh_router) + batadv_neigh_node_put(orig_neigh_router); + if (orig_neigh_node) + batadv_orig_node_put(orig_neigh_node); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); +} + +/** + * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: the Ethernet header of the OGM2 + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + */ +static void +batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, + const struct ethhdr *ethhdr, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + int seqno_age; + + /* first, update the metric with according sanity checks */ + seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, + neigh_node, if_incoming, + if_outgoing); + + /* outdated sequence numbers are to be discarded */ + if (seqno_age < 0) + return; + + /* only unknown & newer OGMs contain TVLVs we are interested in */ + if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT)) + batadv_tvlv_containers_process(bat_priv, true, orig_node, + NULL, NULL, + (unsigned char *)(ogm2 + 1), + ntohs(ogm2->tvlv_len)); + + /* if the metric update went through, update routes if needed */ + batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, + neigh_node, if_incoming, if_outgoing); +} + +/** + * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated + * @buff_pos: current position in the skb + * @packet_len: total length of the skb + * @tvlv_len: tvlv length of the previously considered OGM + * + * Return: true if there is enough space for another OGM, false otherwise. + */ +static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, + __be16 tvlv_len) +{ + int next_buff_pos = 0; + + next_buff_pos += buff_pos + BATADV_OGM2_HLEN; + next_buff_pos += ntohs(tvlv_len); + + return (next_buff_pos <= packet_len) && + (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); +} + +/** + * batadv_v_ogm_process - process an incoming batman v OGM + * @skb: the skb containing the OGM + * @ogm_offset: offset to the OGM which should be processed (for aggregates) + * @if_incoming: the interface where this packet was receved + */ +static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct ethhdr *ethhdr; + struct batadv_orig_node *orig_node = NULL; + struct batadv_hardif_neigh_node *hardif_neigh = NULL; + struct batadv_neigh_node *neigh_node = NULL; + struct batadv_hard_iface *hard_iface; + struct batadv_ogm2_packet *ogm_packet; + u32 ogm_throughput, link_throughput, path_throughput; + + ethhdr = eth_hdr(skb); + ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); + + ogm_throughput = ntohl(ogm_packet->throughput); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, troughput %u, TTL %u, V %u, tvlv_len %u)\n", + ethhdr->h_source, if_incoming->net_dev->name, + if_incoming->net_dev->dev_addr, ogm_packet->orig, + ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, + ogm_packet->version, ntohs(ogm_packet->tvlv_len)); + + /* If the troughput metric is 0, immediately drop the packet. No need to + * create orig_node / neigh_node for an unusable route. + */ + if (ogm_throughput == 0) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: originator packet with troughput metric of 0\n"); + return; + } + + /* require ELP packets be to received from this neighbor first */ + hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); + if (!hardif_neigh) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: OGM via unknown neighbor!\n"); + goto out; + } + + orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); + if (!orig_node) + return; + + neigh_node = batadv_neigh_node_new(orig_node, if_incoming, + ethhdr->h_source); + if (!neigh_node) + goto out; + + /* Update the received throughput metric to match the link + * characteristic: + * - If this OGM traveled one hop so far (emitted by single hop + * neighbor) the path throughput metric equals the link throughput. + * - For OGMs traversing more than hop the path throughput metric is + * the smaller of the path throughput and the link throughput. + */ + link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + path_throughput = min_t(u32, link_throughput, ogm_throughput); + ogm_packet->throughput = htonl(path_throughput); + + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, + neigh_node, if_incoming, + BATADV_IF_DEFAULT); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, + orig_node, neigh_node, + if_incoming, hard_iface); + } + rcu_read_unlock(); +out: + if (orig_node) + batadv_orig_node_put(orig_node); + if (neigh_node) + batadv_neigh_node_put(neigh_node); + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +} + +/** + * batadv_v_ogm_packet_recv - OGM2 receiving handler + * @skb: the received OGM + * @if_incoming: the interface where this OGM has been received + * + * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP + * (without freeing the skb) on failure + */ +int batadv_v_ogm_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_ogm2_packet *ogm_packet; + struct ethhdr *ethhdr = eth_hdr(skb); + int ogm_offset; + u8 *packet_pos; + int ret = NET_RX_DROP; + + /* did we receive a OGM2 packet on an interface that does not have + * B.A.T.M.A.N. V enabled ? + */ + if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + return NET_RX_DROP; + + if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) + return NET_RX_DROP; + + if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) + return NET_RX_DROP; + + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + + if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) + return NET_RX_DROP; + + batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); + batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, + skb->len + ETH_HLEN); + + ogm_offset = 0; + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + + while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), + ogm_packet->tvlv_len)) { + batadv_v_ogm_process(skb, ogm_offset, if_incoming); + + ogm_offset += BATADV_OGM2_HLEN; + ogm_offset += ntohs(ogm_packet->tvlv_len); + + packet_pos = skb->data + ogm_offset; + ogm_packet = (struct batadv_ogm2_packet *)packet_pos; + } + + ret = NET_RX_SUCCESS; + consume_skb(skb); + + return ret; +} + +/** + * batadv_v_ogm_init - initialise the OGM2 engine + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or a negative error code in case of failure + */ +int batadv_v_ogm_init(struct batadv_priv *bat_priv) +{ + struct batadv_ogm2_packet *ogm_packet; + unsigned char *ogm_buff; + u32 random_seqno; + + bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; + ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); + if (!ogm_buff) + return -ENOMEM; + + bat_priv->bat_v.ogm_buff = ogm_buff; + ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; + ogm_packet->packet_type = BATADV_OGM2; + ogm_packet->version = BATADV_COMPAT_VERSION; + ogm_packet->ttl = BATADV_TTL; + ogm_packet->flags = BATADV_NO_FLAGS; + ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); + + /* randomize initial seqno to avoid collision */ + get_random_bytes(&random_seqno, sizeof(random_seqno)); + atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); + INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); + + return 0; +} + +/** + * batadv_v_ogm_free - free OGM private resources + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_v_ogm_free(struct batadv_priv *bat_priv) +{ + cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); + + kfree(bat_priv->bat_v.ogm_buff); + bat_priv->bat_v.ogm_buff = NULL; + bat_priv->bat_v.ogm_buff_len = 0; +} diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h new file mode 100644 index 000000000000..d849c75ada0e --- /dev/null +++ b/net/batman-adv/bat_v_ogm.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ +#define _BATMAN_ADV_BATADV_V_OGM_H_ + +#include <linux/types.h> + +struct batadv_hard_iface; +struct batadv_priv; +struct sk_buff; + +int batadv_v_ogm_init(struct batadv_priv *bat_priv); +void batadv_v_ogm_free(struct batadv_priv *bat_priv); +int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface); +struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, + const u8 *addr); +void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); +int batadv_v_ogm_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming); + +#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 7781f39c174d..0a6c8b824a00 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -159,12 +159,11 @@ static void batadv_backbone_gw_release(struct kref *ref) } /** - * batadv_backbone_gw_free_ref - decrement the backbone gw refcounter and - * possibly release it + * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly + * release it * @backbone_gw: backbone gateway to be free'd */ -static void -batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) +static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } @@ -180,16 +179,16 @@ static void batadv_claim_release(struct kref *ref) claim = container_of(ref, struct batadv_bla_claim, refcount); - batadv_backbone_gw_free_ref(claim->backbone_gw); + batadv_backbone_gw_put(claim->backbone_gw); kfree_rcu(claim, rcu); } /** - * batadv_claim_free_ref - decrement the claim refcounter and possibly + * batadv_claim_put - decrement the claim refcounter and possibly * release it * @claim: claim to be free'd */ -static void batadv_claim_free_ref(struct batadv_bla_claim *claim) +static void batadv_claim_put(struct batadv_bla_claim *claim) { kref_put(&claim->refcount, batadv_claim_release); } @@ -305,7 +304,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_free_ref(claim); + batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); @@ -424,7 +423,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, netif_rx(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -486,7 +485,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } if (own_backbone) { @@ -524,7 +523,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, return; backbone_gw->lasttime = jiffies; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -573,7 +572,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -682,7 +681,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_free_ref(claim->backbone_gw); + batadv_backbone_gw_put(claim->backbone_gw); } /* set (new) backbone gw */ kref_get(&backbone_gw->refcount); @@ -694,7 +693,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, backbone_gw->lasttime = jiffies; claim_free_ref: - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } /** @@ -719,14 +718,14 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); - batadv_claim_free_ref(claim); /* reference from the hash is gone */ + batadv_claim_put(claim); /* reference from the hash is gone */ spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); /* don't need the reference from hash_find() anymore */ - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } /** @@ -783,7 +782,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } } - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } @@ -854,7 +853,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } @@ -891,7 +890,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, /* TODO: we could call something like tt_local_del() here. */ - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } @@ -965,7 +964,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, bla_dst_own->group = bla_dst->group; } - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 2; } @@ -1154,7 +1153,7 @@ purge_now: batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } @@ -1282,7 +1281,7 @@ void batadv_bla_status_update(struct net_device *net_dev) * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1356,7 +1355,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); @@ -1395,7 +1394,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } @@ -1571,7 +1570,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, if (!backbone_gw) return 0; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } @@ -1599,7 +1598,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) bat_priv->bla.backbone_hash = NULL; } if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1692,9 +1691,9 @@ handled: out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } @@ -1781,9 +1780,9 @@ handled: ret = 1; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } @@ -1839,7 +1838,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1904,6 +1903,6 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index e3261118130a..e96d7c745b4a 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -77,11 +77,11 @@ static void batadv_dat_entry_release(struct kref *ref) } /** - * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly + * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly * release it * @dat_entry: dat_entry to be free'd */ -static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) +static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) { kref_put(&dat_entry->refcount, batadv_dat_entry_release); } @@ -135,7 +135,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, continue; hlist_del_rcu(&dat_entry->hash_entry); - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } spin_unlock_bh(list_lock); } @@ -349,7 +349,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); goto out; } @@ -358,7 +358,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -547,7 +547,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, max = tmp_max; if (max_orig_node) - batadv_orig_node_free_ref(max_orig_node); + batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); @@ -654,9 +654,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, goto free_neigh; } - send_status = batadv_send_skb_packet(tmp_skb, - neigh_node->if_incoming, - neigh_node->addr); + send_status = batadv_send_unicast_skb(tmp_skb, neigh_node); if (send_status == NET_XMIT_SUCCESS) { /* count the sent packet */ switch (packet_subtype) { @@ -674,9 +672,9 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, ret = true; } free_neigh: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); free_orig: - batadv_orig_node_free_ref(cand[i].orig_node); + batadv_orig_node_put(cand[i].orig_node); } out: @@ -840,7 +838,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1029,7 +1027,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } @@ -1109,7 +1107,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; @@ -1262,6 +1260,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 55656e84bc7e..e6956d0746a2 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -378,16 +378,15 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, skb->len + ETH_HLEN); packet->ttl--; - batadv_send_skb_packet(skb, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = true; } out: if (orig_node_dst) - batadv_orig_node_free_ref(orig_node_dst); + batadv_orig_node_put(orig_node_dst); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -486,8 +485,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb_fragment->len + ETH_HLEN); - batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb_fragment, neigh_node); frag_header.no++; /* The initial check in this function should cover this case */ @@ -506,13 +504,13 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = true; out_err: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 261866e38502..c59aff5ccac8 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -71,16 +71,15 @@ static void batadv_gw_node_release(struct kref *ref) gw_node = container_of(ref, struct batadv_gw_node, refcount); - batadv_orig_node_free_ref(gw_node->orig_node); + batadv_orig_node_put(gw_node->orig_node); kfree_rcu(gw_node, rcu); } /** - * batadv_gw_node_free_ref - decrement the gw_node refcounter and possibly - * release it + * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it * @gw_node: gateway node to free */ -static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) +static void batadv_gw_node_put(struct batadv_gw_node *gw_node) { kref_put(&gw_node->refcount, batadv_gw_node_release); } @@ -125,7 +124,7 @@ unlock: rcu_read_unlock(); out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); return orig_node; } @@ -143,7 +142,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); if (curr_gw_node) - batadv_gw_node_free_ref(curr_gw_node); + batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -204,7 +203,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) ((tmp_gw_factor == max_gw_factor) && (tq_avg > max_tq))) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -219,7 +218,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) */ if (tq_avg > max_tq) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -232,12 +231,12 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); next: - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); @@ -273,7 +272,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } void batadv_gw_election(struct batadv_priv *bat_priv) @@ -348,13 +347,13 @@ void batadv_gw_election(struct batadv_priv *bat_priv) out: if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (next_gw) - batadv_gw_node_free_ref(next_gw); + batadv_gw_node_put(next_gw); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } void batadv_gw_check_election(struct batadv_priv *bat_priv, @@ -415,15 +414,15 @@ reselect: batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) - batadv_orig_node_free_ref(curr_gw_orig); + batadv_orig_node_put(curr_gw_orig); if (router_gw) - batadv_neigh_node_free_ref(router_gw); + batadv_neigh_node_put(router_gw); if (router_orig) - batadv_neigh_node_free_ref(router_orig); + batadv_neigh_node_put(router_orig); if (router_gw_tq) - batadv_neigh_ifinfo_free_ref(router_gw_tq); + batadv_neigh_ifinfo_put(router_gw_tq); if (router_orig_tq) - batadv_neigh_ifinfo_free_ref(router_orig_tq); + batadv_neigh_ifinfo_put(router_orig_tq); } /** @@ -446,7 +445,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return; } @@ -545,22 +544,23 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); - hlist_del_init_rcu(&gw_node->list); + if (!hlist_unhashed(&gw_node->list)) { + hlist_del_init_rcu(&gw_node->list); + batadv_gw_node_put(gw_node); + } spin_unlock_bh(&bat_priv->gw.list_lock); - batadv_gw_node_free_ref(gw_node); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } void batadv_gw_node_delete(struct batadv_priv *bat_priv, @@ -583,7 +583,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { hlist_del_init_rcu(&gw_node->list); - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -620,12 +620,12 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv, ret = seq_has_overflowed(seq) ? -1 : 0; if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); return ret; } @@ -662,7 +662,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -856,7 +856,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, goto out; curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; - batadv_neigh_ifinfo_free_ref(curr_ifinfo); + batadv_neigh_ifinfo_put(curr_ifinfo); break; case BATADV_GW_MODE_OFF: @@ -874,18 +874,18 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; - batadv_neigh_ifinfo_free_ref(old_ifinfo); + batadv_neigh_ifinfo_put(old_ifinfo); out: if (orig_dst_node) - batadv_orig_node_free_ref(orig_dst_node); + batadv_orig_node_put(orig_dst_node); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); if (neigh_old) - batadv_neigh_node_free_ref(neigh_old); + batadv_neigh_node_put(neigh_old); if (neigh_curr) - batadv_neigh_node_free_ref(neigh_curr); + batadv_neigh_node_put(neigh_curr); return out_of_range; } diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 5ee04f7140af..4423047889e1 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -40,8 +40,8 @@ * * Return: false on parse error and true otherwise. */ -static bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput) +bool batadv_parse_throughput(struct net_device *net_dev, char *buff, + const char *description, u32 *throughput) { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; u64 lthroughput; diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index b58346350024..8a5e1ddf1175 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -49,5 +49,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv); void batadv_gw_init(struct batadv_priv *bat_priv); void batadv_gw_free(struct batadv_priv *bat_priv); +bool batadv_parse_throughput(struct net_device *net_dev, char *buff, + const char *description, u32 *throughput); #endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index fb2d9c058ed0..b22b2775a0a5 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -84,6 +84,28 @@ out: } /** + * batadv_mutual_parents - check if two devices are each others parent + * @dev1: 1st net_device + * @dev2: 2nd net_device + * + * veth devices come in pairs and each is the parent of the other! + * + * Return: true if the devices are each others parent, otherwise false + */ +static bool batadv_mutual_parents(const struct net_device *dev1, + const struct net_device *dev2) +{ + int dev1_parent_iflink = dev_get_iflink(dev1); + int dev2_parent_iflink = dev_get_iflink(dev2); + + if (!dev1_parent_iflink || !dev2_parent_iflink) + return false; + + return (dev1_parent_iflink == dev2->ifindex) && + (dev2_parent_iflink == dev1->ifindex); +} + +/** * batadv_is_on_batman_iface - check if a device is a batman iface descendant * @net_dev: the device to check * @@ -116,6 +138,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) if (WARN(!parent_dev, "Cannot find parent device")) return false; + if (batadv_mutual_parents(net_dev, parent_dev)) + return false; + ret = batadv_is_on_batman_iface(parent_dev); return ret; @@ -201,7 +226,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, @@ -225,7 +250,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, out: if (curr_hard_iface) - batadv_hardif_free_ref(curr_hard_iface); + batadv_hardif_put(curr_hard_iface); } static bool @@ -384,7 +409,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void @@ -537,7 +562,7 @@ err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); err: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -568,7 +593,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_primary_if_select(bat_priv, new_if); if (new_if) - batadv_hardif_free_ref(new_if); + batadv_hardif_put(new_if); } bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); @@ -591,11 +616,11 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, } hard_iface->soft_iface = NULL; - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -614,7 +639,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work) batadv_debugfs_del_hardif(hard_iface); batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); } static struct batadv_hard_iface * @@ -769,10 +794,10 @@ static int batadv_hard_if_event(struct notifier_block *this, } hardif_put: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NOTIFY_DONE; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 5cecc6bc1b1e..d74f1983f33e 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -64,11 +64,11 @@ void batadv_update_min_mtu(struct net_device *soft_iface); void batadv_hardif_release(struct kref *ref); /** - * batadv_hardif_free_ref - decrement the hard interface refcounter and possibly + * batadv_hardif_put - decrement the hard interface refcounter and possibly * release it * @hard_iface: the hard interface to free */ -static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) +static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { kref_put(&hard_iface->refcount, batadv_hardif_release); } diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index a69da37bbad5..14d0013b387e 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -278,7 +278,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr); - batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); goto out; dst_unreach: @@ -288,11 +288,11 @@ free_skb: kfree_skb(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return len; } diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 568c5503f637..d64ddb961979 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -87,6 +87,7 @@ static int __init batadv_init(void) batadv_recv_handler_init(); + batadv_v_init(); batadv_iv_init(); batadv_nc_init(); @@ -159,6 +160,10 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); + ret = batadv_v_mesh_init(bat_priv); + if (ret < 0) + goto err; + ret = batadv_originator_init(bat_priv); if (ret < 0) goto err; @@ -201,6 +206,8 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_purge_outstanding_packets(bat_priv, NULL); batadv_gw_node_free(bat_priv); + + batadv_v_mesh_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -287,7 +294,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq) seq_printf(seq, "BATMAN mesh %s disabled - primary interface not active\n", net_dev->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); primary_if = NULL; out: @@ -638,12 +645,11 @@ static void batadv_tvlv_handler_release(struct kref *ref) } /** - * batadv_tvlv_handler_free_ref - decrement the tvlv container refcounter and + * batadv_tvlv_handler_put - decrement the tvlv container refcounter and * possibly release it * @tvlv_handler: the tvlv handler to free */ -static void -batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) +static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } @@ -695,11 +701,11 @@ static void batadv_tvlv_container_release(struct kref *ref) } /** - * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and + * batadv_tvlv_container_put - decrement the tvlv container refcounter and * possibly release it * @tvlv: the tvlv container to free */ -static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) +static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { kref_put(&tvlv->refcount, batadv_tvlv_container_release); } @@ -785,8 +791,8 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ - batadv_tvlv_container_free_ref(tvlv); - batadv_tvlv_container_free_ref(tvlv); + batadv_tvlv_container_put(tvlv); + batadv_tvlv_container_put(tvlv); } /** @@ -1031,7 +1037,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, src, dst, tvlv_value, tvlv_value_cont_len); if (tvlv_handler) - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -1111,7 +1117,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); return; } @@ -1148,11 +1154,11 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, if (!tvlv_handler) return; - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); } /** @@ -1212,7 +1218,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) kfree_skb(skb); out: - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /** @@ -1262,7 +1268,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } return ap_isolation_enabled; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 32dfc9e578af..db4533631834 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,12 +24,13 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.0" +#define BATADV_SOURCE_VERSION "2016.1" #endif /* B.A.T.M.A.N. parameters */ #define BATADV_TQ_MAX_VALUE 255 +#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 /* Time To Live of broadcast messages */ @@ -60,6 +61,15 @@ #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1 +/* B.A.T.M.A.N. V */ +#define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */ +#define BATADV_ELP_PROBES_PER_NODE 2 +#define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */ +#define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */ +#define BATADV_ELP_MAX_AGE 64 +#define BATADV_OGM_MAX_ORIGDIFF 5 +#define BATADV_OGM_MAX_AGE 64 + /* number of OGMs sent with the last tt diff */ #define BATADV_TT_OGM_APPEND_MAX 3 @@ -100,11 +110,6 @@ */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 -/* how much worse secondary interfaces may be to be considered as bonding - * candidates - */ -#define BATADV_BONDING_TQ_THRESHOLD 50 - /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index a4eb8ee4abb1..b41719b6487a 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -218,16 +218,16 @@ static void batadv_nc_node_release(struct kref *ref) nc_node = container_of(ref, struct batadv_nc_node, refcount); - batadv_orig_node_free_ref(nc_node->orig_node); + batadv_orig_node_put(nc_node->orig_node); kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrement the nc_node refcounter and possibly + * batadv_nc_node_put - decrement the nc_node refcounter and possibly * release it * @nc_node: nc_node to be free'd */ -static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) +static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { kref_put(&nc_node->refcount, batadv_nc_node_release); } @@ -247,11 +247,11 @@ static void batadv_nc_path_release(struct kref *ref) } /** - * batadv_nc_path_free_ref - decrement the nc_path refcounter and possibly + * batadv_nc_path_put - decrement the nc_path refcounter and possibly * release it * @nc_path: nc_path to be free'd */ -static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) +static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { kref_put(&nc_path->refcount, batadv_nc_path_release); } @@ -263,7 +263,7 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) { kfree_skb(nc_packet->skb); - batadv_nc_path_free_ref(nc_packet->nc_path); + batadv_nc_path_put(nc_packet->nc_path); kfree(nc_packet); } @@ -356,7 +356,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, "Removing nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); list_del_rcu(&nc_node->list); - batadv_nc_node_free_ref(nc_node); + batadv_nc_node_put(nc_node); } spin_unlock_bh(lock); } @@ -467,7 +467,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, "Remove nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); } spin_unlock_bh(lock); } @@ -575,9 +575,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, */ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) { - batadv_send_skb_packet(nc_packet->skb, - nc_packet->neigh_node->if_incoming, - nc_packet->nc_path->next_hop); + batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet); } @@ -772,7 +770,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, last_ttl = orig_ifinfo->last_ttl; last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; @@ -942,9 +940,9 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, out: if (in_nc_node) - batadv_nc_node_free_ref(in_nc_node); + batadv_nc_node_put(in_nc_node); if (out_nc_node) - batadv_nc_node_free_ref(out_nc_node); + batadv_nc_node_put(out_nc_node); } /** @@ -1067,11 +1065,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; struct batadv_coded_packet *coded_packet; - struct batadv_neigh_node *neigh_tmp, *router_neigh; - struct batadv_neigh_node *router_coding = NULL; + struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; + struct batadv_neigh_node *router_coding = NULL, *second_dest; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - u8 *first_source, *first_dest, *second_source, *second_dest; + u8 *first_source, *second_source; __be32 packet_id1, packet_id2; size_t count; bool res = false; @@ -1114,9 +1112,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, */ if (tq_weighted_neigh >= tq_weighted_coding) { /* Destination from nc_packet is selected for MAC-header */ - first_dest = nc_packet->nc_path->next_hop; + first_dest = nc_packet->neigh_node; first_source = nc_packet->nc_path->prev_hop; - second_dest = neigh_node->addr; + second_dest = neigh_node; second_source = ethhdr->h_source; packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet2 = (struct batadv_unicast_packet *)skb->data; @@ -1125,9 +1123,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, skb->data + sizeof(*packet2)); } else { /* Destination for skb is selected for MAC-header */ - first_dest = neigh_node->addr; + first_dest = neigh_node; first_source = ethhdr->h_source; - second_dest = nc_packet->nc_path->next_hop; + second_dest = nc_packet->neigh_node; second_source = nc_packet->nc_path->prev_hop; packet1 = (struct batadv_unicast_packet *)skb->data; packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; @@ -1169,7 +1167,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, coded_packet->first_ttvn = packet1->ttvn; /* Info about second unicast packet */ - ether_addr_copy(coded_packet->second_dest, second_dest); + ether_addr_copy(coded_packet->second_dest, second_dest->addr); ether_addr_copy(coded_packet->second_source, second_source); ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); coded_packet->second_crc = packet_id2; @@ -1224,17 +1222,17 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, batadv_nc_packet_free(nc_packet); /* Send the coded packet and return true */ - batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest); + batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: if (router_neigh) - batadv_neigh_node_free_ref(router_neigh); + batadv_neigh_node_put(router_neigh); if (router_coding) - batadv_neigh_node_free_ref(router_coding); + batadv_neigh_node_put(router_coding); if (router_neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo); + batadv_neigh_ifinfo_put(router_neigh_ifinfo); if (router_coding_ifinfo) - batadv_neigh_ifinfo_free_ref(router_coding_ifinfo); + batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } @@ -1372,7 +1370,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return nc_packet; } @@ -1555,7 +1553,7 @@ bool batadv_nc_skb_forward(struct sk_buff *skb, return true; free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: /* Packet is not consumed */ return false; @@ -1617,7 +1615,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, free_skb: kfree_skb(skb); free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: return; } @@ -1950,7 +1948,7 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index eacd0e5a0238..e4cbb0753e37 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -151,11 +151,11 @@ static void batadv_orig_node_vlan_release(struct kref *ref) } /** - * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly - * release the originator-vlan object + * batadv_orig_node_vlan_put - decrement the refcounter and possibly release + * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan) +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } @@ -196,17 +196,17 @@ static void batadv_neigh_ifinfo_release(struct kref *ref) neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); + batadv_hardif_put(neigh_ifinfo->if_outgoing); kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release + * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } @@ -227,16 +227,16 @@ static void batadv_hardif_neigh_release(struct kref *ref) hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - batadv_hardif_free_ref(hardif_neigh->if_incoming); + batadv_hardif_put(hardif_neigh->if_incoming); kfree_rcu(hardif_neigh, rcu); } /** - * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter + * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ -void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) +void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } @@ -259,31 +259,31 @@ static void batadv_neigh_node_release(struct kref *ref) hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, neigh_node->addr); if (hardif_neigh) { /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_free_ref(hardif_neigh); - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); } if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); - batadv_hardif_free_ref(neigh_node->if_incoming); + batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); } /** - * batadv_neigh_node_free_ref - decrement the neighbors refcounter and possibly + * batadv_neigh_node_put - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { kref_put(&neigh_node->refcount, batadv_neigh_node_release); } @@ -544,7 +544,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); if (!hardif_neigh) { - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); goto out; } @@ -681,7 +681,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, out: if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } @@ -707,7 +707,7 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_neigh_print) { seq_puts(seq, @@ -732,22 +732,22 @@ static void batadv_orig_ifinfo_release(struct kref *ref) orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref(orig_ifinfo->if_outgoing); + batadv_hardif_put(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release + * batadv_orig_ifinfo_put - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } @@ -793,13 +793,13 @@ static void batadv_orig_node_release(struct kref *ref) hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); @@ -810,11 +810,11 @@ static void batadv_orig_node_release(struct kref *ref) } /** - * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly + * batadv_orig_node_put - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) +void batadv_orig_node_put(struct batadv_orig_node *orig_node) { kref_put(&orig_node->refcount, batadv_orig_node_release); } @@ -843,7 +843,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { hlist_del_rcu(&orig_node->hash_entry); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } spin_unlock_bh(list_lock); } @@ -917,7 +917,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, * Immediately release vlan since it is not needed anymore in this * context */ - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { INIT_HLIST_HEAD(&orig_node->fragments[i].head); @@ -966,7 +966,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, neigh->addr, if_outgoing->net_dev->name); hlist_del_rcu(&neigh_ifinfo->list); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } spin_unlock_bh(&neigh->ifinfo_lock); @@ -1012,10 +1012,10 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, ifinfo_purged = true; hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (orig_node->last_bonding_candidate == orig_ifinfo) { orig_node->last_bonding_candidate = NULL; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } } @@ -1069,7 +1069,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, neigh_purged = true; hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); } else { /* only necessary if not the whole neighbor is to be * deleted, but some interface has been removed. @@ -1108,7 +1108,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, continue; if (best) - batadv_neigh_node_free_ref(best); + batadv_neigh_node_put(best); best = neigh; } @@ -1154,7 +1154,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); @@ -1171,7 +1171,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); } rcu_read_unlock(); @@ -1204,7 +1204,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, -1, "originator timed out"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); continue; } @@ -1250,7 +1250,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_orig_print) { seq_puts(seq, @@ -1306,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return 0; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 99507408f4cf..4e8b67f11051 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -37,19 +37,19 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); +void batadv_orig_node_put(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void -batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh); +batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); @@ -59,7 +59,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo); +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); @@ -69,7 +69,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo); +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); @@ -83,7 +83,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); /* hashfunction to choose an entry in a hash table of given size * hash algorithm from http://en.wikipedia.org/wiki/Hash_table diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index e7f915181aba..8a8d7ca1a5cf 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -26,6 +26,8 @@ * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV * @BATADV_BCAST: broadcast packets carrying broadcast payload * @BATADV_CODED: network coded packets + * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V + * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V * * @BATADV_UNICAST: unicast packets carrying unicast payload traffic * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original @@ -40,6 +42,8 @@ enum batadv_packettype { BATADV_IV_OGM = 0x00, BATADV_BCAST = 0x01, BATADV_CODED = 0x02, + BATADV_ELP = 0x03, + BATADV_OGM2 = 0x04, /* 0x40 - 0x7f: unicast */ #define BATADV_UNICAST_MIN 0x40 BATADV_UNICAST = 0x40, @@ -235,6 +239,51 @@ struct batadv_ogm_packet { #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) /** + * struct batadv_ogm2_packet - ogm2 (routing protocol) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @flags: reseved for routing relevant flags - currently always 0 + * @seqno: sequence number + * @orig: originator mac address + * @tvlv_len: length of the appended tvlv buffer (in bytes) + * @throughput: the currently flooded path throughput + */ +struct batadv_ogm2_packet { + u8 packet_type; + u8 version; + u8 ttl; + u8 flags; + __be32 seqno; + u8 orig[ETH_ALEN]; + __be16 tvlv_len; + __be32 throughput; + /* __packed is not needed as the struct size is divisible by 4, + * and the largest data type in this struct has a size of 4. + */ +}; + +#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) + +/** + * struct batadv_elp_packet - elp (neighbor discovery) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @orig: originator mac address + * @seqno: sequence number + * @elp_interval: currently used ELP sending interval in ms + */ +struct batadv_elp_packet { + u8 packet_type; + u8 version; + u8 orig[ETH_ALEN]; + __be32 seqno; + __be32 elp_interval; +}; + +#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) + +/** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 205310b56c2b..4dd646a52f1a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -98,7 +98,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); /* increase refcount of new best neighbor */ if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) @@ -107,11 +107,11 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->neigh_list_lock); rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* decrease refcount of previous best neighbor */ if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); } /** @@ -138,7 +138,7 @@ void batadv_update_route(struct batadv_priv *bat_priv, out: if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); } /** @@ -269,9 +269,9 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -317,9 +317,9 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -403,7 +403,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -545,16 +545,16 @@ batadv_find_router(struct batadv_priv *bat_priv, next: /* free references */ if (cand_router) { - batadv_neigh_node_free_ref(cand_router); + batadv_neigh_node_put(cand_router); cand_router = NULL; } - batadv_orig_ifinfo_free_ref(cand); + batadv_orig_ifinfo_put(cand); } rcu_read_unlock(); /* last_bonding_candidate is reset below, remove the old reference. */ if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate); + batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that @@ -562,17 +562,17 @@ next: * 3) there is no candidate at all, return the default router */ if (next_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* remove references to first candidate, we don't need it. */ if (first_candidate) { - batadv_neigh_node_free_ref(first_candidate_router); - batadv_orig_ifinfo_free_ref(first_candidate); + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); } router = next_candidate_router; orig_node->last_bonding_candidate = next_candidate; } else if (first_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* refcounting has already been done in the loop above. */ router = first_candidate_router; @@ -649,7 +649,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -702,9 +702,9 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -768,7 +768,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, return 0; curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /* check if the TTVN contained in the packet is fresher than what the @@ -808,7 +808,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); unicast_packet->ttvn = curr_ttvn; @@ -908,7 +908,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, rx_success: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -1019,7 +1019,7 @@ int batadv_recv_frag_packet(struct sk_buff *skb, out: if (orig_node_src) - batadv_orig_node_free_ref(orig_node_src); + batadv_orig_node_put(orig_node_src); return ret; } @@ -1124,6 +1124,6 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index d8b03fd604e0..3ce06e0a91b1 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -49,16 +49,30 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work); -/* send out an already prepared packet to the given address via the - * specified batman interface +/** + * batadv_send_skb_packet - send an already prepared packet + * @skb: the packet to send + * @hard_iface: the interface to use to send the broadcast packet + * @dst_addr: the payload destination + * + * Send out an already prepared packet to the given neighbor or broadcast it + * using the specified interface. Either hard_iface or neigh_node must be not + * NULL. + * If neigh_node is NULL, then the packet is broadcasted using hard_iface, + * otherwise it is sent as unicast to the given neighbor. + * + * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb) + * otherwise */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv; struct ethhdr *ethhdr; + bat_priv = netdev_priv(hard_iface->soft_iface); + if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -100,6 +114,35 @@ send_skb_err: return NET_XMIT_DROP; } +int batadv_send_broadcast_skb(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); +} + +int batadv_send_unicast_skb(struct sk_buff *skb, + struct batadv_neigh_node *neigh) +{ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hardif_neigh_node *hardif_neigh; +#endif + int ret; + + ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr); + +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); + + if ((hardif_neigh) && (ret != NET_XMIT_DROP)) + hardif_neigh->bat_v.last_unicast_tx = jiffies; + + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +#endif + + return ret; +} + /** * batadv_send_skb_to_orig - Lookup next-hop and transmit skb. * @skb: Packet to be transmitted. @@ -146,14 +189,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { ret = NET_XMIT_POLICED; } else { - batadv_send_skb_packet(skb, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = NET_XMIT_SUCCESS; } out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -246,7 +288,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } @@ -317,7 +359,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (ret == NET_XMIT_DROP) kfree_skb(skb); return ret; @@ -409,9 +451,9 @@ static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) - batadv_hardif_free_ref(forw_packet->if_incoming); + batadv_hardif_put(forw_packet->if_incoming); if (forw_packet->if_outgoing) - batadv_hardif_free_ref(forw_packet->if_outgoing); + batadv_hardif_put(forw_packet->if_outgoing); kfree(forw_packet); } @@ -497,7 +539,7 @@ out_and_inc: atomic_inc(&bat_priv->bcast_queue_left); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_BUSY; } @@ -538,8 +580,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb1) - batadv_send_skb_packet(skb1, hard_iface, - batadv_broadcast_addr); + batadv_send_broadcast_skb(skb1, hard_iface); } rcu_read_unlock(); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 7ff95cada2e7..6fd7270d8ce6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -28,12 +28,16 @@ struct sk_buff; struct work_struct; -int batadv_send_skb_packet(struct sk_buff *skb, - struct batadv_hard_iface *hard_iface, - const u8 *dst_addr); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); +int batadv_send_skb_packet(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface, + const u8 *dst_addr); +int batadv_send_broadcast_skb(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface); +int batadv_send_unicast_skb(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node); void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface); int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d4490ff75edd..0710379491bf 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -377,7 +377,7 @@ dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_OK; } @@ -497,11 +497,11 @@ static void batadv_softif_vlan_release(struct kref *ref) } /** - * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and + * batadv_softif_vlan_put - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) +void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; @@ -552,7 +552,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return -EEXIST; } @@ -601,7 +601,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, vlan->vid, "vlan interface destroyed", false); batadv_sysfs_del_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** @@ -646,7 +646,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, if (!vlan->kobj) { ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (ret) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return ret; } } @@ -693,7 +693,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return 0; } @@ -749,7 +749,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } batadv_sysfs_del_meshif(soft_iface); @@ -878,7 +878,7 @@ static int batadv_softif_slave_add(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -905,7 +905,7 @@ static int batadv_softif_slave_del(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index d17cfbacf809..9ae265703d23 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -34,7 +34,7 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface); int batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan); +void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index ab4382ba3855..e7cf51333a36 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -216,7 +216,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ attr, &vlan->_name, \ bat_priv->soft_iface); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -231,7 +231,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ atomic_read(&vlan->_name) == 0 ? \ "disabled" : "enabled"); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -242,6 +242,55 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \ batadv_store_vlan_##_name) +#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ +ssize_t batadv_store_##_name(struct kobject *kobj, \ + struct attribute *attr, char *buff, \ + size_t count) \ +{ \ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ + struct batadv_hard_iface *hard_iface; \ + ssize_t length; \ + \ + hard_iface = batadv_hardif_get_by_netdev(net_dev); \ + if (!hard_iface) \ + return 0; \ + \ + length = __batadv_store_uint_attr(buff, count, _min, _max, \ + _post_func, attr, \ + &hard_iface->_var, net_dev); \ + \ + batadv_hardif_put(hard_iface); \ + return length; \ +} + +#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ +ssize_t batadv_show_##_name(struct kobject *kobj, \ + struct attribute *attr, char *buff) \ +{ \ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ + struct batadv_hard_iface *hard_iface; \ + ssize_t length; \ + \ + hard_iface = batadv_hardif_get_by_netdev(net_dev); \ + if (!hard_iface) \ + return 0; \ + \ + length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \ + \ + batadv_hardif_put(hard_iface); \ + return length; \ +} + +/* Use this, if you are going to set [name] in hard_iface to an + * unsigned integer value + */ +#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ + static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \ + _max, _post_func) \ + static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ + static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ + batadv_store_##_name) + static int batadv_store_bool_attr(char *buff, size_t count, struct net_device *net_dev, const char *attr_name, atomic_t *attr, @@ -771,7 +820,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, length = sprintf(buff, "%s\n", ifname); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } @@ -795,7 +844,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, if (strlen(buff) >= IFNAMSIZ) { pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", buff); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return -EINVAL; } @@ -829,7 +878,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, unlock: rtnl_unlock(); out: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -863,18 +912,99 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj, break; } - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +/** + * batadv_store_throughput_override - parse and store throughput override + * entered by the user + * @kobj: kobject representing the private mesh sysfs directory + * @attr: the batman-adv attribute the user is interacting with + * @buff: the buffer containing the user data + * @count: number of bytes in the buffer + * + * Return: 'count' on success or a negative error code in case of failure + */ +static ssize_t batadv_store_throughput_override(struct kobject *kobj, + struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_hard_iface *hard_iface; + u32 tp_override; + u32 old_tp_override; + bool ret; + + hard_iface = batadv_hardif_get_by_netdev(net_dev); + if (!hard_iface) + return -EINVAL; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + ret = batadv_parse_throughput(net_dev, buff, "throughput_override", + &tp_override); + if (!ret) + return count; + + old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override); + if (old_tp_override == tp_override) + goto out; + + batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n", + "throughput_override", + old_tp_override / 10, old_tp_override % 10, + tp_override / 10, tp_override % 10); + + atomic_set(&hard_iface->bat_v.throughput_override, tp_override); + +out: + batadv_hardif_put(hard_iface); + return count; +} + +static ssize_t batadv_show_throughput_override(struct kobject *kobj, + struct attribute *attr, + char *buff) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_hard_iface *hard_iface; + u32 tp_override; + + hard_iface = batadv_hardif_get_by_netdev(net_dev); + if (!hard_iface) + return -EINVAL; + + tp_override = atomic_read(&hard_iface->bat_v.throughput_override); + + return sprintf(buff, "%u.%u MBit\n", tp_override / 10, + tp_override % 10); +} + +#endif + static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface, batadv_store_mesh_iface); static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL); +#ifdef CONFIG_BATMAN_ADV_BATMAN_V +BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR, + 2 * BATADV_JITTER, INT_MAX, NULL); +static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR, + batadv_show_throughput_override, + batadv_store_throughput_override); +#endif static struct batadv_attribute *batadv_batman_attrs[] = { &batadv_attr_mesh_iface, &batadv_attr_iface_status, +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + &batadv_attr_elp_interval, + &batadv_attr_throughput_override, +#endif NULL, }; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 11882793f0a4..0b43e86328a5 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -219,12 +219,12 @@ static void batadv_tt_local_entry_release(struct kref *ref) } /** - * batadv_tt_local_entry_free_ref - decrement the tt_local_entry refcounter and + * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and * possibly release it * @tt_local_entry: tt_local_entry to be free'd */ static void -batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry) +batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); @@ -247,12 +247,12 @@ static void batadv_tt_global_entry_release(struct kref *ref) } /** - * batadv_tt_global_entry_free_ref - decrement the tt_global_entry refcounter - * and possibly release it + * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and + * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static void -batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) +batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); @@ -278,7 +278,7 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return 0; count = atomic_read(&tt_global_entry->orig_list_count); - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); return count; } @@ -301,7 +301,7 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, atomic_add(v, &vlan->tt.num_entries); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** @@ -346,12 +346,14 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_init_rcu(&vlan->list); + if (!hlist_unhashed(&vlan->list)) { + hlist_del_init_rcu(&vlan->list); + batadv_orig_node_vlan_put(vlan); + } spin_unlock_bh(&orig_node->vlan_list_lock); - batadv_orig_node_vlan_free_ref(vlan); } - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } /** @@ -390,17 +392,17 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, refcount); - batadv_orig_node_free_ref(orig_entry->orig_node); + batadv_orig_node_put(orig_entry->orig_node); kfree_rcu(orig_entry, rcu); } /** - * batadv_tt_orig_list_entry_free_ref - decrement the tt orig entry refcounter - * and possibly release it + * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and + * possibly release it * @orig_entry: tt orig entry to be free'd */ static void -batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) +batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } @@ -559,7 +561,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } /** @@ -685,8 +687,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_local_entry_free_ref(tt_local); - batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_put(tt_local); + batadv_softif_vlan_put(vlan); goto out; } @@ -752,9 +754,9 @@ out: if (in_dev) dev_put(in_dev); if (tt_local) - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); if (tt_global) - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); return ret; } @@ -1052,13 +1054,13 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) no_purge ? 0 : last_seen_msecs, vlan->tt.crc); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } rcu_read_unlock(); } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1135,19 +1137,19 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, goto out; /* extra call to free the local tt entry */ - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) goto out; - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } @@ -1243,11 +1245,11 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) vlan = batadv_softif_vlan_get(bat_priv, tt_common_entry->vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); } - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -1343,7 +1345,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } return found; @@ -1384,7 +1386,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, out: if (orig_entry) - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /** @@ -1465,7 +1467,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { @@ -1551,9 +1553,9 @@ out_remove: out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -1584,20 +1586,20 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, if (best_router && bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); return best_entry; } @@ -1650,7 +1652,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } print_list: @@ -1682,7 +1684,7 @@ print_list: ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } } @@ -1723,7 +1725,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1751,7 +1753,7 @@ _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, * being part of a list */ hlist_del_rcu(&orig_entry->list); - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /* deletes the orig list of a tt_global_entry */ @@ -1907,9 +1909,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (local_entry) - batadv_tt_local_entry_free_ref(local_entry); + batadv_tt_local_entry_put(local_entry); } /** @@ -1963,7 +1965,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, tt_global->common.addr, BATADV_PRINT_VID(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); @@ -2026,7 +2028,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) hlist_del_rcu(&tt_common->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -2058,7 +2060,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -2139,9 +2141,9 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return orig_node; } @@ -2501,7 +2503,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, return false; crc = vlan->tt.crc; - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; @@ -2636,7 +2638,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); /* hlist_del_init() verifies tt_req_node still is in the list */ @@ -2774,9 +2776,9 @@ unlock: out: if (res_dst_orig_node) - batadv_orig_node_free_ref(res_dst_orig_node); + batadv_orig_node_put(res_dst_orig_node); if (req_dst_orig_node) - batadv_orig_node_free_ref(req_dst_orig_node); + batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } @@ -2891,9 +2893,9 @@ unlock: out: spin_unlock_bh(&bat_priv->tt.commit_lock); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; @@ -2979,7 +2981,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, @@ -3021,7 +3023,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, ret = true; out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3085,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->tt.req_list_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) @@ -3216,7 +3218,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) @@ -3340,11 +3342,11 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); } - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -3427,11 +3429,11 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, ret = true; out: - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3541,7 +3543,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); out: return ret; } @@ -3567,7 +3569,7 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); out: return ret; } @@ -3800,7 +3802,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -3861,7 +3863,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; - batadv_tt_global_entry_free_ref(tt); + batadv_tt_global_entry_put(tt); return ret; } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 612de23178e6..9abfb3e73c34 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -22,6 +22,7 @@ #error only "main.h" can be included directly #endif +#include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/if_ether.h> @@ -86,6 +87,36 @@ struct batadv_hard_iface_bat_iv { }; /** + * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V + * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex + * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no + * throughput data is available for this interface and that default values are + * assumed. + */ +enum batadv_v_hard_iface_flags { + BATADV_FULL_DUPLEX = BIT(0), + BATADV_WARNING_DEFAULT = BIT(1), +}; + +/** + * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data + * @elp_interval: time interval between two ELP transmissions + * @elp_seqno: current ELP sequence number + * @elp_skb: base skb containing the ELP message to send + * @elp_wq: workqueue used to schedule ELP transmissions + * @throughput_override: throughput override to disable link auto-detection + * @flags: interface specific flags + */ +struct batadv_hard_iface_bat_v { + atomic_t elp_interval; + atomic_t elp_seqno; + struct sk_buff *elp_skb; + struct delayed_work elp_wq; + atomic_t throughput_override; + u8 flags; +}; + +/** * struct batadv_hard_iface - network device known to batman-adv * @list: list node for batadv_hardif_list * @if_num: identificator of the interface @@ -99,6 +130,7 @@ struct batadv_hard_iface_bat_iv { * @soft_iface: the batman-adv interface which uses this network interface * @rcu: struct used for freeing in an RCU-safe manner * @bat_iv: per hard-interface B.A.T.M.A.N. IV data + * @bat_v: per hard-interface B.A.T.M.A.N. V data * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface @@ -116,6 +148,9 @@ struct batadv_hard_iface { struct net_device *soft_iface; struct rcu_head rcu; struct batadv_hard_iface_bat_iv bat_iv; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hard_iface_bat_v bat_v; +#endif struct work_struct cleanup_work; struct dentry *debug_dir; struct hlist_head neigh_list; @@ -130,6 +165,7 @@ struct batadv_hard_iface { * @router: router that should be used to reach this originator * @last_real_seqno: last and best known sequence number * @last_ttl: ttl of last received packet + * @last_seqno_forwarded: seqno of the OGM which was forwarded last * @batman_seqno_reset: time when the batman seqno window was reset * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -140,6 +176,7 @@ struct batadv_orig_ifinfo { struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ u32 last_real_seqno; u8 last_ttl; + u32 last_seqno_forwarded; unsigned long batman_seqno_reset; struct kref refcount; struct rcu_head rcu; @@ -346,12 +383,32 @@ struct batadv_gw_node { struct rcu_head rcu; }; +DECLARE_EWMA(throughput, 1024, 8) + +/** + * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor + * information + * @throughput: ewma link throughput towards this neighbor + * @elp_interval: time interval between two ELP transmissions + * @elp_latest_seqno: latest and best known ELP sequence number + * @last_unicast_tx: when the last unicast packet has been sent to this neighbor + * @metric_work: work queue callback item for metric update + */ +struct batadv_hardif_neigh_node_bat_v { + struct ewma_throughput throughput; + u32 elp_interval; + u32 elp_latest_seqno; + unsigned long last_unicast_tx; + struct work_struct metric_work; +}; + /** * struct batadv_hardif_neigh_node - unique neighbor per hard-interface * @list: list node for batadv_hard_iface::neigh_list * @addr: the MAC address of the neighboring interface * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received + * @bat_v: B.A.T.M.A.N. V private data * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner */ @@ -360,6 +417,9 @@ struct batadv_hardif_neigh_node { u8 addr[ETH_ALEN]; struct batadv_hard_iface *if_incoming; unsigned long last_seen; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hardif_neigh_node_bat_v bat_v; +#endif struct kref refcount; struct rcu_head rcu; }; @@ -407,10 +467,22 @@ struct batadv_neigh_ifinfo_bat_iv { }; /** + * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing + * interface for B.A.T.M.A.N. V + * @throughput: last throughput metric received from originator via this neigh + * @last_seqno: last sequence number known for this neighbor + */ +struct batadv_neigh_ifinfo_bat_v { + u32 throughput; + u32 last_seqno; +}; + +/** * struct batadv_neigh_ifinfo - neighbor information per outgoing interface * @list: list node for batadv_neigh_node::ifinfo_list * @if_outgoing: pointer to outgoing hard-interface * @bat_iv: B.A.T.M.A.N. IV private structure + * @bat_v: B.A.T.M.A.N. V private data * @last_ttl: last received ttl from this neigh node * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner @@ -419,6 +491,9 @@ struct batadv_neigh_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_neigh_ifinfo_bat_v bat_v; +#endif u8 last_ttl; struct kref refcount; struct rcu_head rcu; @@ -751,6 +826,20 @@ struct batadv_softif_vlan { }; /** + * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data + * @ogm_buff: buffer holding the OGM packet + * @ogm_buff_len: length of the OGM packet buffer + * @ogm_seqno: OGM sequence number - used to identify each OGM + * @ogm_wq: workqueue used to schedule OGM transmissions + */ +struct batadv_priv_bat_v { + unsigned char *ogm_buff; + int ogm_buff_len; + atomic_t ogm_seqno; + struct delayed_work ogm_wq; +}; + +/** * struct batadv_priv - per mesh interface data * @mesh_state: current status of the mesh (inactive/active/deactivating) * @soft_iface: net device which holds this struct as private data @@ -804,6 +893,7 @@ struct batadv_softif_vlan { * @mcast: multicast data * @network_coding: bool indicating whether network coding is enabled * @nc: network coding data + * @bat_v: B.A.T.M.A.N. V per soft-interface private data */ struct batadv_priv { atomic_t mesh_state; @@ -869,6 +959,9 @@ struct batadv_priv { atomic_t network_coding; struct batadv_priv_nc nc; #endif /* CONFIG_BATMAN_ADV_NC */ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_priv_bat_v bat_v; +#endif }; /** diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 95d1a66ba03a..06c31b9a68b0 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -69,6 +69,15 @@ config BT_6LOWPAN help IPv6 compression over Bluetooth Low Energy. +config BT_LEDS + bool "Enable LED triggers" + depends on BT + depends on LEDS_CLASS + select LEDS_TRIGGERS + help + This option selects a few LED triggers for different + Bluetooth events. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 2b15ae8c1def..b3ff12eb9b6d 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -17,6 +17,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o +bluetooth-$(CONFIG_BT_LEDS) += leds.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 47bcef754796..2713fc86e85a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -40,6 +40,7 @@ #include "hci_request.h" #include "hci_debugfs.h" #include "smp.h" +#include "leds.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1395,6 +1396,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); + hci_leds_update_powered(hdev, true); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && @@ -1532,6 +1534,8 @@ int hci_dev_do_close(struct hci_dev *hdev) return 0; } + hci_leds_update_powered(hdev, false); + /* Flush RX and TX works */ flush_work(&hdev->tx_work); flush_work(&hdev->rx_work); @@ -2017,6 +2021,7 @@ static void hci_power_on(struct work_struct *work) if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { + cancel_delayed_work(&hdev->power_off); hci_req_sync_lock(hdev); err = __hci_req_hci_power_on(hdev); hci_req_sync_unlock(hdev); @@ -3067,6 +3072,8 @@ int hci_register_dev(struct hci_dev *hdev) if (error < 0) goto err_wqueue; + hci_leds_init(hdev); + hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); @@ -4112,8 +4119,10 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, break; } - *req_complete = bt_cb(skb)->hci.req_complete; - *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + else + *req_complete = bt_cb(skb)->hci.req_complete; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c new file mode 100644 index 000000000000..8319c8440c89 --- /dev/null +++ b/net/bluetooth/leds.c @@ -0,0 +1,74 @@ +/* + * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> + +#include "leds.h" + +struct hci_basic_led_trigger { + struct led_trigger led_trigger; + struct hci_dev *hdev; +}; + +#define to_hci_basic_led_trigger(arg) container_of(arg, \ + struct hci_basic_led_trigger, led_trigger) + +void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) +{ + if (hdev->power_led) + led_trigger_event(hdev->power_led, + enabled ? LED_FULL : LED_OFF); +} + +static void power_activate(struct led_classdev *led_cdev) +{ + struct hci_basic_led_trigger *htrig; + bool powered; + + htrig = to_hci_basic_led_trigger(led_cdev->trigger); + powered = test_bit(HCI_UP, &htrig->hdev->flags); + + led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF); +} + +static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, + void (*activate)(struct led_classdev *led_cdev), + const char *name) +{ + struct hci_basic_led_trigger *htrig; + + htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL); + if (!htrig) + return NULL; + + htrig->hdev = hdev; + htrig->led_trigger.activate = activate; + htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL, + "%s-%s", hdev->name, + name); + if (!htrig->led_trigger.name) + goto err_alloc; + + if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger)) + goto err_register; + + return &htrig->led_trigger; + +err_register: + devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name); +err_alloc: + devm_kfree(&hdev->dev, htrig); + return NULL; +} + +void hci_leds_init(struct hci_dev *hdev) +{ + /* initialize power_led */ + hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); +} diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h new file mode 100644 index 000000000000..a9c4d6ea01cf --- /dev/null +++ b/net/bluetooth/leds.h @@ -0,0 +1,16 @@ +/* + * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#if IS_ENABLED(CONFIG_BT_LEDS) +void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); +void hci_leds_init(struct hci_dev *hdev); +#else +static inline void hci_leds_update_powered(struct hci_dev *hdev, + bool enabled) {} +static inline void hci_leds_init(struct hci_dev *hdev) {} +#endif diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82e3e9705017..dcea4f4c62b3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { + int err; + if (idx < cb->args[0]) goto skip; @@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb, if (!filter_dev && f->dst) goto skip; - if (fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI) < 0) + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) { + cb->args[1] = err; break; + } skip: ++idx; } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index fcdb86dd5a23..f47759f05b6d 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -44,7 +44,6 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_push(skb, ETH_HLEN); br_drop_fake_rtable(skb); - skb_sender_cpu_clear(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && (skb->protocol == htons(ETH_P_8021Q) || diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index c367b3e1b5ac..a73df3315df9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -36,10 +36,10 @@ */ static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; - if (!__ethtool_get_settings(dev, &ecmd)) { - switch (ethtool_cmd_speed(&ecmd)) { + if (!__ethtool_get_link_ksettings(dev, &ecmd)) { + switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_1000: @@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head) destroy_nbp(p); } +static unsigned get_max_headroom(struct net_bridge *br) +{ + unsigned max_headroom = 0; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); + + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + + return max_headroom; +} + +static void update_headroom(struct net_bridge *br, int new_hr) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) + netdev_set_rx_headroom(p->dev, new_hr); + + br->dev->needed_headroom = new_hr; +} + /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. @@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); list_del_rcu(&p->list); + if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) + update_headroom(br, get_max_headroom(br)); + netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); @@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + unsigned br_hr, dev_hr; bool changed_addr; /* Don't allow bridging non-ethernet like devices, or DSA-enabled @@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) netdev_update_features(br->dev); - if (br->dev->needed_headroom < dev->needed_headroom) - br->dev->needed_headroom = dev->needed_headroom; + br_hr = br->dev->needed_headroom; + dev_hr = netdev_get_fwd_headroom(dev); + if (br_hr < dev_hr) + update_headroom(br, dev_hr); + else + netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index ac089286526e..253bc77eda3b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; - struct nlattr *nest; + struct nlattr *nest, *port_nest; if (!br->multicast_router || hlist_empty(&br->router_list)) return 0; @@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, return -EMSGSIZE; hlist_for_each_entry_rcu(p, &br->router_list, rlist) { - if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex)) + if (!p) + continue; + port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || + nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, + br_timer_value(&p->multicast_router_timer)) || + nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, + p->multicast_router)) { + nla_nest_cancel(skb, port_nest); goto fail; + } + nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); @@ -88,26 +100,41 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + port = p->port; - if (port) { - struct br_mdb_entry e; - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.vid = p->addr.vid; - __mdb_entry_fill_flags(&e, p->flags); - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, + MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } + nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest2); skip: @@ -437,8 +464,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, mp = br_mdb_ip_get(mdb, group); if (!mp) { mp = br_multicast_new_group(br, port, group); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + err = PTR_ERR_OR_ZERO(mp); + if (err) return err; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8b6e4249be1b..a4c15df2b792 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -759,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data) struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); - if (port->multicast_router != 1 || + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(&port->multicast_router_timer) || hlist_unhashed(&port->rlist)) goto out; hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if the router expired */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; out: spin_unlock(&br->multicast_lock); @@ -912,7 +916,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) void br_multicast_add_port(struct net_bridge_port *port) { - port->multicast_router = 1; + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); @@ -959,7 +963,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + if (port->multicast_router == MDB_RTR_TYPE_PERM && + hlist_unhashed(&port->rlist)) br_multicast_add_router(br, port); out: @@ -980,6 +985,9 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (!hlist_unhashed(&port->rlist)) { hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if disabling */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); @@ -1227,13 +1235,14 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == 1) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); return; } - if (port->multicast_router != 1) + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(br, port); @@ -1713,7 +1722,7 @@ void br_multicast_init(struct net_bridge *br) br->hash_elasticity = 4; br->hash_max = 512; - br->multicast_router = 1; + br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_querier = 0; br->multicast_query_use_ifaddr = 0; br->multicast_last_member_count = 2; @@ -1823,11 +1832,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) spin_lock_bh(&br->multicast_lock); switch (val) { - case 0: - case 2: + case MDB_RTR_TYPE_DISABLED: + case MDB_RTR_TYPE_PERM: del_timer(&br->multicast_router_timer); /* fall through */ - case 1: + case MDB_RTR_TYPE_TEMP_QUERY: br->multicast_router = val; err = 0; break; @@ -1838,37 +1847,53 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) return err; } +static void __del_port_router(struct net_bridge_port *p) +{ + if (hlist_unhashed(&p->rlist)) + return; + hlist_del_init_rcu(&p->rlist); + br_rtr_notify(p->br->dev, p, RTM_DELMDB); +} + int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; + unsigned long now = jiffies; int err = -EINVAL; spin_lock(&br->multicast_lock); - - switch (val) { - case 0: - case 1: - case 2: - p->multicast_router = val; + if (p->multicast_router == val) { + /* Refresh the temp router port timer */ + if (p->multicast_router == MDB_RTR_TYPE_TEMP) + mod_timer(&p->multicast_router_timer, + now + br->multicast_querier_interval); err = 0; - - if (val < 2 && !hlist_unhashed(&p->rlist)) { - hlist_del_init_rcu(&p->rlist); - br_rtr_notify(br->dev, p, RTM_DELMDB); - } - - if (val == 1) - break; - + goto unlock; + } + switch (val) { + case MDB_RTR_TYPE_DISABLED: + p->multicast_router = MDB_RTR_TYPE_DISABLED; + __del_port_router(p); + del_timer(&p->multicast_router_timer); + break; + case MDB_RTR_TYPE_TEMP_QUERY: + p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + __del_port_router(p); + break; + case MDB_RTR_TYPE_PERM: + p->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&p->multicast_router_timer); - - if (val == 0) - break; - br_multicast_add_router(br, p); break; + case MDB_RTR_TYPE_TEMP: + p->multicast_router = MDB_RTR_TYPE_TEMP; + br_multicast_mark_router(br, p); + break; + default: + goto unlock; } - + err = 0; +unlock: spin_unlock(&br->multicast_lock); return err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7ddbe7ec81d6..44114a94c576 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -37,6 +37,7 @@ #include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> +#include <net/netns/generic.h> #include <asm/uaccess.h> #include "br_private.h" @@ -44,6 +45,12 @@ #include <linux/sysctl.h> #endif +static int brnf_net_id __read_mostly; + +struct brnf_net { + bool enabled; +}; + #ifdef CONFIG_SYSCTL static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; @@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { }, }; +static int brnf_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct brnf_net *brnet; + struct net *net; + int ret; + + if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE)) + return NOTIFY_DONE; + + ASSERT_RTNL(); + + net = dev_net(dev); + brnet = net_generic(net, brnf_net_id); + if (brnet->enabled) + return NOTIFY_OK; + + ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret) + return NOTIFY_BAD; + + brnet->enabled = true; + return NOTIFY_OK; +} + +static void __net_exit brnf_exit_net(struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + if (!brnet->enabled) + return; + + nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + brnet->enabled = false; +} + +static struct pernet_operations brnf_net_ops __read_mostly = { + .exit = brnf_exit_net, + .id = &brnf_net_id, + .size = sizeof(struct brnf_net), +}; + +static struct notifier_block brnf_notifier __read_mostly = { + .notifier_call = brnf_device_event, +}; + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, @@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void) { int ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; + ret = register_netdevice_notifier(&brnf_notifier); + if (ret < 0) { + unregister_pernet_subsys(&brnf_net_ops); + return ret; + } + #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); return -ENOMEM; } #endif @@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void) static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 40197ff8918a..e9c635eae24d 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) return -ENETDOWN; br_set_state(p, state); - br_log_state(p); br_port_state_selection(p->br); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 302ab0a43725..1b5d145dfcbf 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -900,7 +900,6 @@ static inline void br_nf_core_fini(void) {} #endif /* br_stp.c */ -void br_log_state(const struct net_bridge_port *p); void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b3cca126b103..e23449094188 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -30,13 +30,6 @@ static const char *const br_port_state_names[] = { [BR_STATE_BLOCKING] = "blocking", }; -void br_log_state(const struct net_bridge_port *p) -{ - br_info(p->br, "port %u(%s) entered %s state\n", - (unsigned int) p->port_no, p->dev->name, - br_port_state_names[p->state]); -} - void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { @@ -52,6 +45,10 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); + else + br_info(p->br, "port %u(%s) entered %s state\n", + (unsigned int) p->port_no, p->dev->name, + br_port_state_names[p->state]); } /* called under bridge lock */ @@ -126,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br, (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay > 0) @@ -407,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->forward_delay_timer); @@ -431,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay != 0) @@ -568,6 +562,14 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) } +/* Set time interval that dynamic forwarding entries live + * For pure software bridge, allow values outside the 802.1 + * standard specification for special cases: + * 0 - entry never ages (all permanant) + * 1 - entry disappears (no persistance) + * + * Offloaded switch entries maybe more restrictive + */ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) { struct switchdev_attr attr = { @@ -579,9 +581,6 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) unsigned long t = clock_t_to_jiffies(ageing_time); int err; - if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME) - return -ERANGE; - err = switchdev_port_attr_set(br->dev, &attr); if (err) return err; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a31ac6ad76a2..984d46263007 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -102,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); } @@ -118,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p) p->topology_change_ack = 0; p->config_pending = 0; - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 5f0f5af0ec35..da058b85aa22 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg) br_topology_change_detection(br); netif_carrier_on(br->dev); } - br_log_state(p); rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); rcu_read_unlock(); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 85e43af4af7a..9309bb4f2a5b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -955,6 +955,13 @@ err_rhtbl: */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = port->dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; struct net_bridge_vlan *vlan; int ret; @@ -962,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { + /* Pass the flags to the hardware bridge */ + ret = switchdev_port_obj_add(port->dev, &v.obj); + if (ret && ret != -EOPNOTSUPP) + return ret; __vlan_add_flags(vlan, flags); return 0; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index fdba3d9fbff3..adc8d7221dbb 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -48,6 +48,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; + struct net *net = sock_net(oldskb->sk); if (!nft_bridge_iphdr_validate(oldskb)) return; @@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = sysctl_ip_default_ttl; + niph->ttl = net->ipv4.sysctl_ip_default_ttl; niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -85,6 +86,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, void *payload; __wsum csum; u8 proto; + struct net *net = sock_net(oldskb->sk); if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) return; @@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); skb_reset_transport_header(nskb); icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index 61d7617d9249..b82440e1fcb4 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) tmppkt = NULL; /* Verify that length is correct */ - err = EPROTO; + err = -EPROTO; if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1) goto out; } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb22d5bb..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_TAKE: if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || - (-1-curstep->arg1 < map->max_buckets && + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && map->buckets[-1-curstep->arg1])) { w[0] = curstep->arg1; wsize = 1; @@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, osize = 0; for (i = 0; i < wsize; i++) { + int bno; /* * see CRUSH_N, CRUSH_N_MINUS macros. * basically, numrep <= 0 means relative to @@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + /* w[i] is probably CRUSH_ITEM_NONE */ + dprintk(" bad w[i] %d\n", w[i]); + continue; + } if (firstn) { int recurse_tries; if (choose_leaf_tries) @@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, numrep, curstep->arg2, @@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { @@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, out_size, numrep, curstep->arg2, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9cfedf565f5b..9382619a405b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1197,6 +1197,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, return new_piece; } +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { BUG_ON(!msg); @@ -2335,9 +2342,9 @@ static int read_partial_message(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; - return 0; + return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); @@ -2360,10 +2367,10 @@ static int read_partial_message(struct ceph_connection *con) /* skip this message */ dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; - return 0; + return 1; } BUG_ON(!con->in_msg); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..5bc053778fed 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; @@ -2843,8 +2853,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - pr_warn("%s osd%d tid %llu unknown, skipping\n", - __func__, osd->o_osd, tid); + dout("%s osd%d tid %llu unknown, skipping\n", __func__, + osd->o_osd, tid); m = NULL; *skip = 1; goto out; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..243574c8cf33 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); - dout("crush decode tunable choose_local_tries = %d", + dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); - dout("crush decode tunable choose_local_fallback_tries = %d", + dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); - dout("crush decode tunable choose_total_tries = %d", + dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); - dout("crush decode tunable chooseleaf_descend_once = %d", + dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); - dout("crush decode tunable chooseleaf_vary_r = %d", + dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); + /* skip straw_calc_version, allowed_bucket_algs */ + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); + *p += sizeof(u8) + sizeof(u32); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_stable = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_stable = %d\n", + c->chooseleaf_stable); + done: dout("crush_decode success\n"); return c; diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de04de3..014422e2561f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,5 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/dev.c b/net/core/dev.c index 3f4071a84a03..edb7179bc051 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5389,12 +5389,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry((*iter)->next, struct netdev_adjacent, list); + lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = &lower->list; + *iter = lower->list.next; return lower->dev; } @@ -7440,8 +7440,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); - if (!dev->tx_queue_len) + if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; diff --git a/net/core/devlink.c b/net/core/devlink.c new file mode 100644 index 000000000000..590fa561cb7f --- /dev/null +++ b/net/core/devlink.c @@ -0,0 +1,738 @@ +/* + * net/core/devlink.c - Network physical/parent device Netlink interface + * + * Heavily inspired by net/wireless/ + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <rdma/ib_verbs.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/devlink.h> + +static LIST_HEAD(devlink_list); + +/* devlink_mutex + * + * An overall lock guarding every operation coming from userspace. + * It also guards devlink devices list and it is taken when + * driver registers/unregisters it. + */ +static DEFINE_MUTEX(devlink_mutex); + +/* devlink_port_mutex + * + * Shared lock to guard lists of ports in all devlink devices. + */ +static DEFINE_MUTEX(devlink_port_mutex); + +static struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} + +static void devlink_net_set(struct devlink *devlink, struct net *net) +{ + write_pnet(&devlink->_net, net); +} + +static struct devlink *devlink_get_from_attrs(struct net *net, + struct nlattr **attrs) +{ + struct devlink *devlink; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + list_for_each_entry(devlink, &devlink_list, list) { + if (strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0 && + net_eq(devlink_net(devlink), net)) + return devlink; + } + + return ERR_PTR(-ENODEV); +} + +static struct devlink *devlink_get_from_info(struct genl_info *info) +{ + return devlink_get_from_attrs(genl_info_net(info), info->attrs); +} + +static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + int port_index) +{ + struct devlink_port *devlink_port; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (devlink_port->index == port_index) + return devlink_port; + } + return NULL; +} + +static bool devlink_port_index_exists(struct devlink *devlink, int port_index) +{ + return devlink_port_get_by_index(devlink, port_index); +} + +static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return ERR_PTR(-ENODEV); + return devlink_port; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_port_get_from_attrs(devlink, info->attrs); +} + +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) + +static int devlink_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_info(info); + if (IS_ERR(devlink)) { + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink); + } + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + struct devlink_port *devlink_port; + + mutex_lock(&devlink_port_mutex); + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_port); + } + info->user_ptr[1] = devlink_port; + } + return 0; +} + +static void devlink_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); +} + +static struct genl_family devlink_nl_family = { + .id = GENL_ID_GENERATE, + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, +}; + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure; + if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && + nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, + devlink_port->desired_type)) + goto nla_put_failure; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net_device *netdev = devlink_port->type_dev; + + if (netdev && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + netdev->ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, + netdev->name))) + goto nla_put_failure; + } + if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { + struct ib_device *ibdev = devlink_port->type_dev; + + if (ibdev && + nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, + ibdev->name)) + goto nla_put_failure; + } + if (devlink_port->split && + nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + devlink_port->split_group)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + if (!devlink_port->registered) + return; + + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_PORT_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_port *devlink_port; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_port_type_set(struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_port_type port_type) + +{ + int err; + + if (devlink->ops && devlink->ops->port_type_set) { + if (port_type == DEVLINK_PORT_TYPE_NOTSET) + return -EINVAL; + err = devlink->ops->port_type_set(devlink_port, port_type); + if (err) + return err; + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; + } + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + int err; + + if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { + enum devlink_port_type port_type; + + port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); + err = devlink_port_type_set(devlink, devlink_port, port_type); + if (err) + return err; + } + return 0; +} + +static int devlink_port_split(struct devlink *devlink, + u32 port_index, u32 count) + +{ + if (devlink->ops && devlink->ops->port_split) + return devlink->ops->port_split(devlink, port_index, count); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + u32 count; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || + !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + return devlink_port_split(devlink, port_index, count); +} + +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) + +{ + if (devlink->ops && devlink->ops->port_unsplit) + return devlink->ops->port_unsplit(devlink, port_index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + return devlink_port_unsplit(devlink, port_index); +} + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, +}; + +static const struct genl_ops devlink_nl_ops[] = { + { + .cmd = DEVLINK_CMD_GET, + .doit = devlink_nl_cmd_get_doit, + .dumpit = devlink_nl_cmd_get_dumpit, + .policy = devlink_nl_policy, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .doit = devlink_nl_cmd_port_get_doit, + .dumpit = devlink_nl_cmd_port_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_SET, + .doit = devlink_nl_cmd_port_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_SPLIT, + .doit = devlink_nl_cmd_port_split_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .doit = devlink_nl_cmd_port_unsplit_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +/** + * devlink_alloc - Allocate new devlink instance resources + * + * @ops: ops + * @priv_size: size of user private data + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +{ + struct devlink *devlink; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + devlink->ops = ops; + devlink_net_set(devlink, &init_net); + INIT_LIST_HEAD(&devlink->port_list); + return devlink; +} +EXPORT_SYMBOL_GPL(devlink_alloc); + +/** + * devlink_register - Register devlink instance + * + * @devlink: devlink + */ +int devlink_register(struct devlink *devlink, struct device *dev) +{ + mutex_lock(&devlink_mutex); + devlink->dev = dev; + list_add_tail(&devlink->list, &devlink_list); + devlink_notify(devlink, DEVLINK_CMD_NEW); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devlink_unregister - Unregister devlink instance + * + * @devlink: devlink + */ +void devlink_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink_notify(devlink, DEVLINK_CMD_DEL); + list_del(&devlink->list); + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +/** + * devlink_port_register - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + mutex_lock(&devlink_port_mutex); + if (devlink_port_index_exists(devlink, port_index)) { + mutex_unlock(&devlink_port_mutex); + return -EEXIST; + } + devlink_port->devlink = devlink; + devlink_port->index = port_index; + devlink_port->type = DEVLINK_PORT_TYPE_NOTSET; + devlink_port->registered = true; + list_add_tail(&devlink_port->list, &devlink->port_list); + mutex_unlock(&devlink_port_mutex); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_register); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + mutex_lock(&devlink_port_mutex); + list_del(&devlink_port->list); + mutex_unlock(&devlink_port_mutex); +} +EXPORT_SYMBOL_GPL(devlink_port_unregister); + +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + devlink_port->type = type; + devlink_port->type_dev = type_dev; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_ETH, netdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); + +/** + * devlink_port_type_ib_set - Set port type to InfiniBand + * + * @devlink_port: devlink port + * @ibdev: related IB device + */ +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_IB, ibdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); + +/** + * devlink_port_type_clear - Clear port type + * + * @devlink_port: devlink port + */ +void devlink_port_type_clear(struct devlink_port *devlink_port) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_NOTSET, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_clear); + +/** + * devlink_port_split_set - Set port is split + * + * @devlink_port: devlink port + * @split_group: split group - identifies group split port is part of + */ +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ + devlink_port->split = true; + devlink_port->split_group = split_group; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_split_set); + +static int __init devlink_module_init(void) +{ + return genl_register_family_with_ops_groups(&devlink_nl_family, + devlink_nl_ops, + devlink_nl_mcgrps); +} + +static void __exit devlink_module_exit(void) +{ + genl_unregister_family(&devlink_nl_family); +} + +module_init(devlink_module_init); +module_exit(devlink_module_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Network physical device Netlink interface"); +MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME); diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8d72..b5cbbe07f786 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000000..3938f3f38d69 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 453c803f1c87..2966cd0d7c93 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; static const char @@ -386,43 +387,461 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } -int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + dst[0] = legacy_u32; +} + +/* return false if src had higher bits set. lower bits always updated. */ +static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) +{ + bool retval = true; + + /* TODO: following test will soon always be true */ + if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); + + bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_fill(ext, 32); + bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (bitmap_intersects(ext, src, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + /* src mask goes beyond bit 31 */ + retval = false; + } + } + *legacy_u32 = src[0]; + return retval; +} + +/* return false if legacy contained non-0 deprecated fields + * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +static bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->transceiver || + legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +/* return false if ksettings link modes had higher bits + * set. legacy_settings always updated (best effort) + */ +static bool +convert_link_ksettings_to_legacy_settings( + struct ethtool_cmd *legacy_settings, + const struct ethtool_link_ksettings *link_ksettings) +{ + bool retval = true; + + memset(legacy_settings, 0, sizeof(*legacy_settings)); + /* this also clears the deprecated fields in legacy structure: + * __u8 transceiver; + * __u32 maxtxpkt; + * __u32 maxrxpkt; + */ + + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->supported, + link_ksettings->link_modes.supported); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->advertising, + link_ksettings->link_modes.advertising); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->lp_advertising, + link_ksettings->link_modes.lp_advertising); + ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); + legacy_settings->duplex + = link_ksettings->base.duplex; + legacy_settings->port + = link_ksettings->base.port; + legacy_settings->phy_address + = link_ksettings->base.phy_address; + legacy_settings->autoneg + = link_ksettings->base.autoneg; + legacy_settings->mdio_support + = link_ksettings->base.mdio_support; + legacy_settings->eth_tp_mdix + = link_ksettings->base.eth_tp_mdix; + legacy_settings->eth_tp_mdix_ctrl + = link_ksettings->base.eth_tp_mdix_ctrl; + return retval; +} + +/* number of 32-bit words to store the user's link mode bitmaps */ +#define __ETHTOOL_LINK_MODE_MASK_NU32 \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + struct { + __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + } link_modes; +}; + +/* Internal kernel helper to query a device ethtool_link_settings. + * + * Backward compatibility note: for compatibility with legacy drivers + * that implement only the ethtool_cmd API, this has to work with both + * drivers implementing get_link_ksettings API and drivers + * implementing get_settings API. When drivers implement get_settings + * and report ethtool_cmd deprecated fields + * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored + * because the resulting struct ethtool_link_settings does not report them. + */ +int __ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + int err; + struct ethtool_cmd cmd; + ASSERT_RTNL(); + if (dev->ethtool_ops->get_link_ksettings) { + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, + link_ksettings); + } + + /* driver doesn't support %ethtool_link_ksettings API. revert to + * legacy %ethtool_cmd API, unless it's not supported either. + * TODO: remove when ethtool_ops::get_settings disappears internally + */ if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt + */ + convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); + return err; } -EXPORT_SYMBOL(__ethtool_get_settings); +EXPORT_SYMBOL(__ethtool_get_link_ksettings); -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +/* convert ethtool_link_usettings in user space to a kernel internal + * ethtool_link_ksettings. return 0 on success, errno on error. + */ +static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, + const void __user *from) { - int err; - struct ethtool_cmd cmd; + struct ethtool_link_usettings link_usettings; + + if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) + return -EFAULT; + + memcpy(&to->base, &link_usettings.base, sizeof(to->base)); + bitmap_from_u32array(to->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + + return 0; +} + +/* convert a kernel internal ethtool_link_ksettings to + * ethtool_link_usettings in user space. return 0 on success, errno on + * error. + */ +static int +store_link_ksettings_for_user(void __user *to, + const struct ethtool_link_ksettings *from) +{ + struct ethtool_link_usettings link_usettings; + + memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + bitmap_to_u32array(link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) + return -EFAULT; + + return 0; +} + +/* Query device for its ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::get_link_ksettings, even if legacy + * ethtool_ops::get_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_GSET for + * this driver, so that they can correctly access the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_get_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err = 0; + struct ethtool_link_ksettings link_ksettings; - err = __ethtool_get_settings(dev, &cmd); + ASSERT_RTNL(); + + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + /* handle bitmap nbits handshake */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) { + /* wrong link mode nbits requested */ + memset(&link_ksettings, 0, sizeof(link_ksettings)); + /* keep cmd field reset to 0 */ + /* send back number of words required as negative val */ + compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, + "need too many bits for link modes!"); + link_ksettings.base.link_mode_masks_nwords + = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); + + /* copy the base fields back to user, not the link + * mode bitmaps + */ + if (copy_to_user(useraddr, &link_ksettings.base, + sizeof(link_ksettings.base))) + return -EFAULT; + + return 0; + } + + /* handshake successful: user/kernel agree on + * link_mode_masks_nwords + */ + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; + /* make sure we tell the right values to user */ + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + + return store_link_ksettings_for_user(useraddr, &link_ksettings); +} + +/* Update device ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::set_link_ksettings, even if legacy + * ethtool_ops::set_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_SSET for + * this driver, so that they can correctly update the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_set_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + /* make sure nbits field has expected value */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + /* copy the whole structure, now that we know it has expected + * format + */ + err = load_link_ksettings_from_user(&link_ksettings, useraddr); + if (err) + return err; + + /* re-check nwords field, just in case */ + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +static void +warn_incomplete_ethtool_legacy_settings_conversion(const char *details) +{ + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", + get_task_comm(name, current), details); +} + +/* Query device for its ethtool_cmd settings. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing get_link_ksettings + * API and drivers implementing get_settings API. When drivers + * implement get_link_ksettings and report higher link mode bits, a + * kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful + * (only the lower link mode bits reported back to user). + */ +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + ASSERT_RTNL(); + + if (dev->ethtool_ops->get_link_ksettings) { + /* First, use link_ksettings API if it is supported */ + int err; + struct ethtool_link_ksettings link_ksettings; + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, + &link_ksettings); + if (err < 0) + return err; + if (!convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings)) + warn_incomplete_ethtool_legacy_settings_conversion( + "link modes are only partially reported"); + + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; + } else { + /* driver doesn't support %ethtool_link_ksettings + * API. revert to legacy %ethtool_cmd API, unless it's + * not supported either. + */ + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + } + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; + return 0; } +/* Update device link settings with given ethtool_cmd. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing set_link_ksettings + * API and drivers implementing set_settings API. When drivers + * implement set_link_ksettings and user's request updates deprecated + * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, and the request is rejected. + */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; - if (!dev->ethtool_ops->set_settings) - return -EOPNOTSUPP; + ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; + /* first, try new %ethtool_link_ksettings API. */ + if (dev->ethtool_ops->set_link_ksettings) { + struct ethtool_link_ksettings link_ksettings; + + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, + &cmd)) + return -EINVAL; + + link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, + &link_ksettings); + } + + /* legacy %ethtool_cmd API */ + + /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings + * disappears internally + */ + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + return dev->ethtool_ops->set_settings(dev, &cmd); } @@ -642,6 +1061,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -738,6 +1188,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -897,6 +1355,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1227,14 +1693,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels; + struct ethtool_channels channels, max; + u32 max_rx_in_use = 0; - if (!dev->ethtool_ops->set_channels) + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + dev->ethtool_ops->get_channels(dev, &max); + + /* ensure new counts are within the maximums */ + if ((channels.rx_count > max.max_rx) || + (channels.tx_count > max.max_tx) || + (channels.combined_count > max.max_combined) || + (channels.other_count > max.max_other)) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -1823,13 +2306,121 @@ out: return ret; } +static int ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static int ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) + return -EFAULT; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; - u32 ethcmd; + u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; @@ -1839,8 +2430,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } /* Allow some commands to be done by anyone */ - switch (ethcmd) { + switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: @@ -2070,6 +2667,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr); + break; + case ETHTOOL_GLINKSETTINGS: + rc = ethtool_get_link_ksettings(dev, useraddr); + break; + case ETHTOOL_SLINKSETTINGS: + rc = ethtool_set_link_ksettings(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 2a6e9562f1ab..6fc3893a6170 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -530,12 +530,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -1333,18 +1335,25 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_LDST_LEN 16U +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[BPF_LDST_LEN]; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1355,14 +1364,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + len))) + if (unlikely(skb_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); + ptr = skb_header_pointer(skb, offset, len, sp->buff); if (unlikely(!ptr)) return -EFAULT; @@ -1371,17 +1378,19 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) memcpy(ptr, from, len); - if (ptr == buf) + if (ptr == sp->buff) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1400,7 +1409,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK)) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, to); @@ -1412,7 +1421,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_load_bytes_proto = { +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1432,9 +1441,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1442,6 +1449,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; @@ -1459,7 +1472,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1474,23 +1487,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1501,6 +1522,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, sizeof(sum)); @@ -1508,7 +1531,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1519,6 +1542,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1543,11 +1605,10 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) } skb2->dev = dev; - skb_sender_cpu_clear(skb2); return dev_queue_xmit(skb2); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1596,11 +1657,10 @@ int skb_do_redirect(struct sk_buff *skb) } skb->dev = dev; - skb_sender_cpu_clear(skb); return dev_queue_xmit(skb); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1682,6 +1742,13 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_vlan_pop) return true; + if (func == bpf_skb_store_bytes) + return true; + if (func == bpf_l3_csum_replace) + return true; + if (func == bpf_l4_csum_replace) + return true; + return false; } @@ -1703,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EPROTO; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; +set_compat: to = (struct bpf_tunnel_key *)compat; break; default: @@ -1720,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - if (flags & BPF_F_TUNINFO_IPV6) + if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); - else + to->tunnel_label = be32_to_cpu(info->key.label); + } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy((void *)(long) r2, to, size); @@ -1732,7 +1804,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1742,6 +1814,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1752,10 +1850,12 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1768,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } + if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label)) + return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1776,7 +1878,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1785,14 +1890,18 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1802,17 +1911,58 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +#define BPF_TUNLEN_MAX 255 + +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > BPF_TUNLEN_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, + options_len) != 1); + + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1849,6 +1999,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -1864,7 +2016,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d79699c9d1b9..a669dea146c6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,25 +19,12 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool dissector_uses_key(const struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) -{ - return flow_dissector->used_keys & (1 << key_id); -} - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } -static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id, - void *target_container) -{ - return ((char *) target_container) + flow_dissector->offset[key_id]; -} - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) @@ -178,15 +165,16 @@ ip: ip_proto = iph->protocol; - if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -208,7 +196,6 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; - __be32 flow_label; ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); @@ -220,18 +207,21 @@ ipv6: if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { - struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; - - key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target_container); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + memcpy(&key_addrs->v6addrs, &iph->saddr, + sizeof(key_addrs->v6addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - flow_label = ip6_flowlabel(iph); - if (flow_label) { + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, @@ -336,8 +326,11 @@ mpls: } case htons(ETH_P_FCOE): - key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); - /* fall through */ + if ((hlen - nhoff) < FCOE_HEADER_LEN) + goto out_bad; + + nhoff += FCOE_HEADER_LEN; + goto out_good; default: goto out_bad; } @@ -396,6 +389,13 @@ ip_proto_again: goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = nhoff; } key_control->flags |= FLOW_DIS_ENCAPSULATION; @@ -437,13 +437,12 @@ ip_proto_again: key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { - ip_proto = fh->nexthdr; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) goto ip_proto_again; - } } goto out_good; } @@ -730,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, { u32 poff = keys->control.thoff; + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 299cfc24d888..669ecc9f884e 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -27,6 +27,31 @@ #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; @@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + rcu_read_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + } + } +#endif if (likely(ops && ops->build_state)) ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index da7dbc237a5f..2b3f76fe65f4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,7 +29,6 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; - switch (cmd.duplex) { + + switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0260c84ed83c..11fce17274f6 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -9,7 +9,6 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f1efbc39ef6b..2ec86fc552df 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1474cfd2dc1c..20999aa596dd 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); @@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 62737f437c8e..d2d9e5ebf58e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1391,15 +1391,6 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, }; -static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { - [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, - [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, -}; - static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, @@ -2979,6 +2970,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); out: netif_addr_unlock_bh(dev); + cb->args[1] = err; return idx; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -3012,6 +3004,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } + cb->args[1] = 0; for_each_netdev(net, dev) { if (brport_idx && (dev->ifindex != brport_idx)) continue; @@ -3039,12 +3032,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx); } + if (cb->args[1] == -EMSGSIZE) + break; if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx); else idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (cb->args[1] == -EMSGSIZE) + break; cops = NULL; } diff --git a/net/core/scm.c b/net/core/scm.c index 14596fb37172..2696aefdc148 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; @@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fpp++ = file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } @@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a5bd067ec1a3..f044f970f1a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,8 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support @@ -799,9 +801,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget) if (unlikely(!skb)) return; - /* if budget is 0 assume netpoll w/ IRQs disabled */ + /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) { - dev_consume_skb_irq(skb); + dev_consume_skb_any(skb); return; } @@ -1916,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, struct splice_pipe_desc *spd, struct sock *sk) { int seg; + struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -1942,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return true; } + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + return false; } @@ -1968,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, @@ -1989,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) - break; - } - -done: if (spd.nr_pages) ret = splice_cb(sk, pipe, &spd); @@ -3021,6 +3016,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** + * skb_push_rcsum - push skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_push on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_push unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len) +{ + skb_push(skb, len); + skb_postpush_rcsum(skb, skb->data, len); + return skb->data; +} + +/** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled @@ -4165,9 +4178,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, if (!pskb_may_pull(skb_chk, offset)) goto err; - __skb_pull(skb_chk, offset); + skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); - __skb_push(skb_chk, offset); + skb_push_rcsum(skb_chk, offset); if (ret) goto err; @@ -4300,7 +4313,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb_sender_cpu_clear(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); @@ -4496,9 +4508,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) skb->mac_len += VLAN_HLEN; __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 46dc8ad7d050..4493ff820c2c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -987,6 +987,10 @@ set_rcvbuf: sk->sk_incoming_cpu = val; break; + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; default: ret = -ENOPROTOOPT; break; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 95b6139d710c..a6beb7b6ae55 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,7 @@ static int zero = 0; static int one = 1; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &max_skb_frags, + }, { } }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 1e0c600c83ae..b5672e5fe649 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -824,26 +824,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v4_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 45cbe85f0940..4663a01d5039 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -691,26 +691,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v6_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index fa4daba8db55..d8fb47fcad05 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -935,6 +935,14 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; @@ -988,14 +996,6 @@ static int dsa_suspend(struct device *d) struct dsa_switch_tree *dst = platform_get_drvdata(pdev); int i, ret = 0; - dst->master_netdev->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); - for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 40b9ca72aae3..27bf03d11670 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -201,47 +201,6 @@ out: return 0; } -static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, - const struct net_device *bridge, - u16 vid_begin, u16 vid_end) -{ - struct dsa_slave_priv *p; - struct net_device *dev, *vlan_br; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 vid; - int member, err; - - if (!ds->drv->vlan_getnext || !vid_begin) - return -EOPNOTSUPP; - - vid = vid_begin - 1; - - do { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (vid > vid_end) - break; - - member = find_first_bit(members, DSA_MAX_PORTS); - if (member == DSA_MAX_PORTS) - continue; - - dev = ds->ports[member]; - p = netdev_priv(dev); - vlan_br = p->bridge_dev; - if (vlan_br == bridge) - continue; - - netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid); - return -EOPNOTSUPP; - } while (vid < vid_end); - - return err == -ENOENT ? 0 : err; -} - static int dsa_slave_port_vlan_add(struct net_device *dev, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) @@ -254,15 +213,6 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) return -EOPNOTSUPP; - /* If the requested port doesn't belong to the same bridge as - * the VLAN members, fallback to software VLAN (hopefully). - */ - err = dsa_bridge_check_vlan_range(ds, p->bridge_dev, - vlan->vid_begin, - vlan->vid_end); - if (err) - return err; - err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); if (err) return err; @@ -293,41 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 pvid, vid = 0; - int err; - - if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get) - return -EOPNOTSUPP; - - err = ds->drv->port_pvid_get(ds, p->port, &pvid); - if (err) - return err; - - for (;;) { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (!test_bit(p->port, members)) - continue; - - memset(vlan, 0, sizeof(*vlan)); - vlan->vid_begin = vlan->vid_end = vid; - if (vid == pvid) - vlan->flags |= BRIDGE_VLAN_INFO_PVID; + if (ds->drv->port_vlan_dump) + return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); - if (test_bit(p->port, untagged)) - vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; - - err = cb(&vlan->obj); - if (err) - break; - } - - return err == -ENOENT ? 0 : err; + return -EOPNOTSUPP; } static int dsa_slave_port_fdb_add(struct net_device *dev, @@ -385,31 +305,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -/* Return a bitmask of all ports being currently bridged within a given bridge - * device. Note that on leave, the mask will still return the bitmask of ports - * currently bridged, prior to port removal, and this is exactly what we want. - */ -static u32 dsa_slave_br_port_mask(struct dsa_switch *ds, - struct net_device *bridge) -{ - struct dsa_slave_priv *p; - unsigned int port; - u32 mask = 0; - - for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!dsa_is_port_initialized(ds, port)) - continue; - - p = netdev_priv(ds->ports[port]); - - if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT && - p->bridge_dev == bridge) - mask |= 1 << port; - } - - return mask; -} - static int dsa_slave_stp_update(struct net_device *dev, u8 state) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -422,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) return ret; } +static int dsa_slave_vlan_filtering(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + if (ds->drv->port_vlan_filtering) + return ds->drv->port_vlan_filtering(ds, p->port, + attr->u.vlan_filtering); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -438,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = ds->drv->port_stp_update(ds, p->port, attr->u.stp_state); break; + case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + ret = dsa_slave_vlan_filtering(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; @@ -533,8 +449,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, p->bridge_dev = br; if (ds->drv->port_join_bridge) - ret = ds->drv->port_join_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, br)); + ret = ds->drv->port_join_bridge(ds, p->port, br); return ret; } @@ -547,8 +462,7 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) if (ds->drv->port_leave_bridge) - ret = ds->drv->port_leave_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, p->bridge_dev)); + ret = ds->drv->port_leave_bridge(ds, p->port); p->bridge_dev = NULL; @@ -1194,7 +1108,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - phy_disconnect(p->phy); ds->ports[port] = NULL; free_netdev(slave_dev); return ret; @@ -1205,6 +1118,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); + unregister_netdev(slave_dev); free_netdev(slave_dev); return ret; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 103871784e50..66dff5e3d772 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL(eth_header); */ u32 eth_get_headlen(void *data, unsigned int len) { + const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys keys; @@ -134,7 +135,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len, 0)) + sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 737c87a2a41e..0023c9048812 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -207,7 +207,7 @@ static int lowpan_device_event(struct notifier_block *unused, struct net_device *wdev = netdev_notifier_info_to_dev(ptr); if (wdev->type != ARPHRD_IEEE802154) - goto out; + return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: @@ -219,11 +219,10 @@ static int lowpan_device_event(struct notifier_block *unused, lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL); break; default: - break; + return NOTIFY_DONE; } -out: - return NOTIFY_DONE; + return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 775824720b6b..238225b0c970 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE @@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET If unsure, say Y. -config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - default y - ---help--- - Support for Large Receive Offload (ipv4/tcp). - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 62c049b647e9..bfa133691cde 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eade66db214e..0cc923f83e10 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1095,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p) } EXPORT_SYMBOL(inet_unregister_protosw); -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1131,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (new_saddr == old_saddr) return 0; - if (sysctl_ip_dynaddr > 1) { + if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } @@ -1186,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk) * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ - if (!sysctl_ip_dynaddr || + if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) @@ -1386,6 +1380,32 @@ out: return pp; } +#define SECONDS_PER_DAY 86400 + +/* inet_current_timestamp - Return IP network timestamp + * + * Return milliseconds since midnight in network byte order. + */ +__be32 inet_current_timestamp(void) +{ + u32 secs; + u32 msecs; + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + /* Get secs since midnight. */ + (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); + /* Convert to msecs. */ + msecs = secs * MSEC_PER_SEC; + /* Convert nsec to msec. */ + msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; + + /* Convert to network byte order. */ + return htons(msecs); +} +EXPORT_SYMBOL(inet_current_timestamp); + int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { if (sk->sk_family == AF_INET) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c102eb5ac55c..c34c7544d1db 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!in_dev) - goto out; + goto out_free_skb; arp = arp_hdr(skb); @@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) - goto out; + goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: @@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) - goto out; + goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) - goto out; + goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) - goto out; + goto out_free_skb; break; } @@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) - goto out; + goto out_free_skb; /* * Extract fields @@ -733,7 +733,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) - goto out; + goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), @@ -741,7 +741,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) - goto out; + goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address @@ -778,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); - goto out; + goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && @@ -803,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } } - goto out; + goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || @@ -826,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) in_dev->arp_parms, skb); goto out_free_dst; } - goto out; + goto out_consume_skb; } } } @@ -876,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } -out: +out_consume_skb: consume_skb(skb); + out_free_dst: dst_release(reply_dst); - return 0; + return NET_RX_SUCCESS; + +out_free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) @@ -924,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, consumeskb: consume_skb(skb); - return 0; + return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: - return 0; + return NET_RX_DROP; } /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3d835313575e..65e76a48382c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1194,6 +1194,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; struct in_device *in_dev; struct net *net = dev_net(dev); + int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1214,12 +1215,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (addr) goto out_unlock; no_in_dev: + master_idx = l3mdev_master_ifindex_rcu(dev); + + /* For VRFs, the VRF device takes the place of the loopback device, + * with addresses on it being preferred. Note in such cases the + * loopback device will be among the devices that fail the master_idx + * equality check in the loop below. + */ + if (master_idx && + (dev = dev_get_by_index_rcu(net, master_idx)) && + (in_dev = __in_dev_get_rcu(dev))) { + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock; + } + } endfor_ifa(in_dev); + } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; @@ -1731,17 +1753,20 @@ static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; - /* type -1 is used for ALL */ - if (type == -1 || type == NETCONFA_FORWARDING) + if (type == NETCONFA_ALL) + all = true; + + if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_RP_FILTER) + if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_MC_FORWARDING) + if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_PROXY_NEIGH) + if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; @@ -1754,36 +1779,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, { struct nlmsghdr *nlh; struct netconfmsg *ncm; + bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; + if (type == NETCONFA_ALL) + all = true; + ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; - /* type -1 is used for ALL */ - if ((type == -1 || type == NETCONFA_FORWARDING) && + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF(*devconf, FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_RP_FILTER) && + if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; @@ -1847,7 +1875,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -1871,14 +1899,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, - -1); + NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1922,7 +1950,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) { + NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } @@ -1938,7 +1966,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -1949,7 +1977,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 88dab0c1670c..780484243e14 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -319,8 +319,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, skb_gro_pull(skb, hdrlen); - flush = 0; - for (p = *head; p; p = p->next) { const struct guehdr *guehdr2; @@ -352,6 +350,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 003b0ebbcfdd..540866dbd27d 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum; + bool need_csum, ufo; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -58,8 +58,20 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + features &= skb->dev->hw_enc_features; + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags based + * on the fact that we will be computing our checksum in software. + */ + if (ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum) + features |= NETIF_F_HW_CSUM; + } + /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { @@ -75,8 +87,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, struct gre_base_hdr *greh; __be32 *pcsum; - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } skb->mac_len = mac_len; skb->protocol = protocol; @@ -160,8 +175,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - flush = 0; - for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; @@ -198,6 +211,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, greh, grehlen); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c908..6333489771ed 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb) */ static bool icmp_timestamp(struct sk_buff *skb) { - struct timespec tv; struct icmp_bxm icmp_param; /* * Too short. @@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - getnstimeofday(&tv); - icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + - tv.tv_nsec / NSEC_PER_MSEC); + icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7c95335bf85e..9b4ca87f70ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -350,9 +350,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) skb_dst_set(skb, &rt->dst); skb->dev = dev; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); @@ -1224,7 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); +#endif if (im->loaded == 0) { im->loaded = 1; @@ -1316,7 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev, void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { struct ip_mc_list *im; +#ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -1643,7 +1646,9 @@ void ip_mc_down(struct in_device *in_dev) void ip_mc_init_dev(struct in_device *in_dev) { +#ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST @@ -1662,7 +1667,9 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -2923,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } + /* Sysctl initialization */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3d28c6d5c3c3..bc5196ea1bdf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -202,6 +202,7 @@ tb_found: if (((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && + !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size == -1) goto success; @@ -782,14 +783,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, reqsk_put(req); } -void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, - struct sock *child) +struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); + child = NULL; } else { req->sk = child; req->dl_next = NULL; @@ -801,6 +804,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); + return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); @@ -810,11 +814,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, if (own_req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; } /* Too bad, another child took ownership of the request, undo. */ bh_unlock_sock(child); diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49b28fb..000000000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * linux/net/ipv4/inet_lro.c - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 2007 - * - * Authors: - * Jan-Bernd Themann <themann@de.ibm.com> - * Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ - (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, - int len, const struct net_lro_desc *lro_desc) -{ - /* check ip header: don't aggregate padded frames */ - if (ntohs(iph->tot_len) != len) - return -1; - - if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) - return -1; - - if (iph->ihl != IPH_LEN_WO_OPTIONS) - return -1; - - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || - tcph->rst || tcph->syn || tcph->fin) - return -1; - - if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) - return -1; - - if (tcph->doff != TCPH_LEN_WO_OPTIONS && - tcph->doff != TCPH_LEN_W_TIMESTAMP) - return -1; - - /* check tcp options (only timestamp allowed) */ - if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { - __be32 *topt = (__be32 *)(tcph + 1); - - if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return -1; - - /* timestamp should be in right order */ - topt++; - if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), - ntohl(*topt))) - return -1; - - /* timestamp reply should not be zero */ - topt++; - if (*topt == 0) - return -1; - } - - return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ - struct iphdr *iph = lro_desc->iph; - struct tcphdr *tcph = lro_desc->tcph; - __be32 *p; - __wsum tcp_hdr_csum; - - tcph->ack_seq = lro_desc->tcp_ack; - tcph->window = lro_desc->tcp_window; - - if (lro_desc->tcp_saw_tstamp) { - p = (__be32 *)(tcph + 1); - *(p+2) = lro_desc->tcp_rcv_tsecr; - } - - csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); - iph->tot_len = htons(lro_desc->ip_tot_len); - - tcph->check = 0; - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); - lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); - tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - lro_desc->ip_tot_len - - IP_HDR_LEN(iph), IPPROTO_TCP, - lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ - __wsum tcp_csum; - __wsum tcp_hdr_csum; - __wsum tcp_ps_hdr_csum; - - tcp_csum = ~csum_unfold(tcph->check); - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - - tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - len + TCP_HDR_LEN(tcph), - IPPROTO_TCP, 0); - - return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), - tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - int nr_frags; - __be32 *ptr; - u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - nr_frags = skb_shinfo(skb)->nr_frags; - lro_desc->parent = skb; - lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); - lro_desc->iph = iph; - lro_desc->tcph = tcph; - lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; - lro_desc->tcp_ack = tcph->ack_seq; - lro_desc->tcp_window = tcph->window; - - lro_desc->pkt_aggr_cnt = 1; - lro_desc->ip_tot_len = ntohs(iph->tot_len); - - if (tcph->doff == 8) { - ptr = (__be32 *)(tcph+1); - lro_desc->tcp_saw_tstamp = 1; - lro_desc->tcp_rcv_tsval = *(ptr+1); - lro_desc->tcp_rcv_tsecr = *(ptr+2); - } - - lro_desc->mss = tcp_data_len; - lro_desc->active = 1; - - lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, - tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ - memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, - struct tcphdr *tcph, int tcp_data_len) -{ - struct sk_buff *parent = lro_desc->parent; - __be32 *topt; - - lro_desc->pkt_aggr_cnt++; - lro_desc->ip_tot_len += tcp_data_len; - lro_desc->tcp_next_seq += tcp_data_len; - lro_desc->tcp_window = tcph->window; - lro_desc->tcp_ack = tcph->ack_seq; - - /* don't update tcp_rcv_tsval, would not work with PAWS */ - if (lro_desc->tcp_saw_tstamp) { - topt = (__be32 *) (tcph + 1); - lro_desc->tcp_rcv_tsecr = *(topt + 2); - } - - lro_desc->data_csum = csum_block_add(lro_desc->data_csum, - lro_tcp_data_csum(iph, tcph, - tcp_data_len), - parent->len); - - parent->len += tcp_data_len; - parent->data_len += tcp_data_len; - if (tcp_data_len > lro_desc->mss) - lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *parent = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb_pull(skb, (skb->len - tcp_data_len)); - parent->truesize += skb->truesize; - - if (lro_desc->last_skb) - lro_desc->last_skb->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, - struct iphdr *iph, - struct tcphdr *tcph) -{ - if ((lro_desc->iph->saddr != iph->saddr) || - (lro_desc->iph->daddr != iph->daddr) || - (lro_desc->tcph->source != tcph->source) || - (lro_desc->tcph->dest != tcph->dest)) - return -1; - return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_arr, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc = NULL; - struct net_lro_desc *tmp; - int max_desc = lro_mgr->max_desc; - int i; - - for (i = 0; i < max_desc; i++) { - tmp = &lro_arr[i]; - if (tmp->active) - if (!lro_check_tcp_conn(tmp, iph, tcph)) { - lro_desc = tmp; - goto out; - } - } - - for (i = 0; i < max_desc; i++) { - if (!lro_arr[i].active) { - lro_desc = &lro_arr[i]; - goto out; - } - } - - LRO_INC_STATS(lro_mgr, no_desc); -out: - return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_desc) -{ - if (lro_desc->pkt_aggr_cnt > 1) - lro_update_tcp_ip_header(lro_desc); - - skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - - LRO_INC_STATS(lro_mgr, flushed); - lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - void *priv) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - u64 flags; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_skb_header || - lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) - goto out; - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) - goto out; - - skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return 0; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) - goto out2; - - lro_add_packet(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return 0; - -out2: /* send aggregated SKBs to stack */ - lro_flush(lro_mgr, lro_desc); - -out: - return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); - } -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ - int i; - struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - - for (i = 0; i < lro_mgr->max_desc; i++) { - if (lro_desc[i].active) - lro_flush(lro_mgr, &lro_desc[i]); - } -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index da0d7ce85844..af18f1e4889e 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(opt->optlen)) ip_forward_options(skb); - skb_sender_cpu_clear(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 187c6fcc3027..efbd47d1a531 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? + qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; + unsigned int max = qp->q.net->max_dist; unsigned int start, end; int rc; @@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ipfrag_max_dist", + .data = &init_net.ipv4.frags.max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, { } }; @@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero - }, { } }; @@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[3].data = &net->ipv4.frags.max_dist; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.frags.max_dist = 64; + res = inet_frags_init_net(&net->ipv4.frags); if (res) return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 65748db44285..31936d387cfd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return iptunnel_pull_header(skb, hdr_len, tpi->proto, false); } static void ipgre_err(struct sk_buff *skb, u32 info, @@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -540,9 +541,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); @@ -1063,8 +1072,9 @@ static const struct net_device_ops gre_tap_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); - dev->netdev_ops = &gre_tap_netdev_ops; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } @@ -1249,6 +1259,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, err = ipgre_newlink(net, dev, tb, NULL); if (err < 0) goto out; + + /* openvswitch users expect packet sizes to be unrestricted, + * so set the largest MTU we can. + */ + err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); + if (err) + goto out; + return dev; out: free_netdev(dev); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 852002f64c68..e3d782746d9d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,12 @@ drop: return true; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); - static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bd246792360b..4d158ff1def1 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, if (opt->ts_needaddr) ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); if (opt->ts_needtime) { - struct timespec tv; __be32 midtime; - getnstimeofday(&tv); - midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); + + midtime = inet_current_timestamp(); memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); } return; @@ -415,11 +414,10 @@ int ip_options_compile(struct net *net, break; } if (timeptr) { - struct timespec tv; - u32 midtime; - getnstimeofday(&tv); - midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; - put_unaligned_be32(midtime, timeptr); + __be32 midtime; + + midtime = inet_current_timestamp(); + memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 64878efa045c..124bf0a66328 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); - static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, @@ -1236,13 +1233,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { + if (skb->ip_summed != CHECKSUM_PARTIAL) + return -EOPNOTSUPP; + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } + cork->length += size; while (size > 0) { if (skb_is_gso(skb)) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 92808f147ef5..035ad645a8d9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + + /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) @@ -1341,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->tos; break; case IP_TTL: + { + struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - sysctl_ip_default_ttl : + net->ipv4.sysctl_ip_default_ttl : inet->uc_ttl); break; + } case IP_HDRINCL: val = inet->hdrincl; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c7bd72e9b544..6aad0192443d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -661,6 +607,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -729,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -739,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -758,7 +708,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; @@ -836,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -943,17 +892,31 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); -int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + if (new_mtu < 68) return -EINVAL; + + if (new_mtu > max_mtu) { + if (strict) + return -EINVAL; + + new_mtu = max_mtu; + } + dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + return __ip_tunnel_change_mtu(dev, new_mtu, true); +} EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) @@ -961,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1155,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1193,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a6e58b6141cd..eaca2449a09a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb, xnet); return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 6ec5b42fd172..ec51d02166de 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b488cac9c5ca..bf081927e06b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1780,9 +1780,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, - const struct xt_table *table, - const struct arpt_replace *repl) +static void __arpt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct arpt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int arpt_register_table(struct net *net, + const struct xt_table *table, + const struct arpt_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -1791,10 +1811,8 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1809,30 +1827,28 @@ struct xt_table *arpt_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __arpt_unregister_table(new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void arpt_unregister_table(struct xt_table *table) +void arpt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct arpt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __arpt_unregister_table(table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 1897ee160920..dd8c80dc32a2 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) +static int __net_init arptable_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, + .table_init = arptable_filter_table_init, }; /* The work comes in here from netfilter.c */ @@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_net_init(struct net *net) +static int __net_init arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; - + int err; + + if (net->ipv4.arptable_filter) + return 0; + repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, repl); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, + &net->ipv4.arptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); + return err; } static void __net_exit arptable_filter_net_exit(struct net *net) { - arpt_unregister_table(net->ipv4.arptable_filter); + if (!net->ipv4.arptable_filter) + return; + arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); + net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { - .init = arptable_filter_net_init, .exit = arptable_filter_net_exit, }; @@ -62,26 +71,23 @@ static int __init arptable_filter_init(void) { int ret; + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) + return PTR_ERR(arpfilter_ops); + ret = register_pernet_subsys(&arptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(arpfilter_ops); return ret; - - arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) { - ret = PTR_ERR(arpfilter_ops); - goto cleanup_table; } - return ret; -cleanup_table: - unregister_pernet_subsys(&arptable_filter_net_ops); return ret; } static void __exit arptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } module_init(arptable_filter_init); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..e53f8d6f326d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2062,9 +2062,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, - const struct xt_table *table, - const struct ipt_replace *repl) +static void __ipt_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ipt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ipt_register_table(struct net *net, const struct xt_table *table, + const struct ipt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2073,10 +2091,8 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2091,30 +2107,27 @@ struct xt_table *ipt_register_table(struct net *net, goto out_free; } - return new_table; + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ipt_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ipt_unregister_table(struct net *net, struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ipt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc556514ba..7b8fbb352877 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -21,6 +21,7 @@ static struct iphdr * synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; + struct net *net = sock_net(skb->sk); skb_reset_network_header(skb); iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = net->ipv4.sysctl_ip_default_ttl; iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 397ef2dd133e..7667f223d7f8 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -30,6 +31,7 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, + .table_init = iptable_filter_table_init, }; static unsigned int @@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = true; +static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_net_init(struct net *net) +static int __net_init iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; + int err; + + if (net->ipv4.iptable_filter) + return 0; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, repl); + err = ipt_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv4.iptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); + return err; +} + +static int __net_init iptable_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return iptable_filter_table_init(net); + + return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_filter); + if (!net->ipv4.iptable_filter) + return; + ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { @@ -82,24 +99,21 @@ static int __init iptable_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - unregister_pernet_subsys(&iptable_filter_net_ops); - } + kfree(filter_ops); return ret; } static void __exit iptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); + kfree(filter_ops); } module_init(iptable_filter_init); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index ba5d392a13c4..57fc97cdac70 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init iptable_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, + .table_init = iptable_mangle_table_init, }; static unsigned int @@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv, } static struct nf_hook_ops *mangle_ops __read_mostly; - -static int __net_init iptable_mangle_net_init(struct net *net) +static int __net_init iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_mangle) + return 0; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, repl); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv4.iptable_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); + return ret; } static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_mangle); + if (!net->ipv4.iptable_mangle) + return; + ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { - .init = iptable_mangle_net_init, .exit = iptable_mangle_net_exit, }; @@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + return ret; + } + ret = register_pernet_subsys(&iptable_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; + } - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); + ret = iptable_mangle_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } return ret; @@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } module_init(iptable_mangle_init); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ae2cd2752046..138a24bc76ad 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -18,6 +18,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init iptable_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, + .table_init = iptable_nat_table_init, }; static unsigned int iptable_nat_do_chain(void *priv, @@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, }; -static int __net_init iptable_nat_net_init(struct net *net) +static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.nat_table) + return 0; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, + nf_nat_ipv4_ops, &net->ipv4.nat_table); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.nat_table); + return ret; } static void __net_exit iptable_nat_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.nat_table); + if (!net->ipv4.nat_table) + return; + ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); + net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { - .init = iptable_nat_net_init, .exit = iptable_nat_net_exit, }; static int __init iptable_nat_init(void) { - int err; + int ret = register_pernet_subsys(&iptable_nat_net_ops); - err = register_pernet_subsys(&iptable_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&iptable_nat_net_ops); -err1: - return err; + ret = iptable_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&iptable_nat_net_ops); + return ret; } static void __exit iptable_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); unregister_pernet_subsys(&iptable_nat_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1ba02811acb0..2642ecd2645c 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -10,12 +10,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, + .table_init = iptable_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_net_init(struct net *net) +static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_raw) + return 0; repl = ipt_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, repl); + ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv4.iptable_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); + return ret; } static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_raw); + if (!net->ipv4.iptable_raw) + return; + ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { - .init = iptable_raw_net_init, .exit = iptable_raw_net_exit, }; @@ -61,15 +70,20 @@ static int __init iptable_raw_init(void) { int ret; + rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&iptable_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; + } - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); + ret = iptable_raw_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } return ret; @@ -77,8 +91,8 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c2e23d5e9cd4..ff226596e4b5 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init iptable_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, + .table_init = iptable_security_table_init, }; static unsigned int @@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_net_init(struct net *net) +static int __net_init iptable_security_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_security) + return 0; repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, repl); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv4.iptable_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); + return ret; } static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_security); + if (!net->ipv4.iptable_security) + return; + + ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { - .init = iptable_security_net_init, .exit = iptable_security_net_exit, }; @@ -78,27 +88,29 @@ static int __init iptable_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; + ret = iptable_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); + } -cleanup_table: - unregister_pernet_subsys(&iptable_security_net_ops); return ret; } static void __exit iptable_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a04dee536b8e..d88da36b383c 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, err = ip_defrag(net, skb, user); local_bh_enable(); - if (!err) { - ip_send_check(ip_hdr(skb)); + if (!err) skb->ignore_df = 1; - } return err; } diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index b72ffc58e255..51ced81b616c 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index f6f93fc2c61f..cf9700b1a106 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -748,8 +748,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; } @@ -1140,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations ping_v4_seq_ops = { - .show = ping_v4_seq_show, - .start = ping_v4_seq_start, - .next = ping_seq_next, - .stop = ping_seq_stop, -}; - static int ping_seq_open(struct inode *inode, struct file *file) { struct ping_seq_afinfo *afinfo = PDE_DATA(inode); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7a3adf..9f665b63a927 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d6352515d738..8d22de74080c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -549,8 +549,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(net, msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); goto out; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e429c6..02c62299d717 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ @@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, 0); + 0, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) + if (fnhe) { rth = rcu_dereference(fnhe->fnhe_rth_input); - else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(&FIB_RES_NH(*res), daddr); + fnhe = NULL; + } else { + goto rt_cache; + } + } + + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); +rt_cache: if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); - if (fnhe) + if (fnhe) { prth = &fnhe->fnhe_rth_output; - else { - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; + rth = rcu_dereference(*prth); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(nh, fl4->daddr); + fnhe = NULL; + } else { + goto rt_cache; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } + + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); + +rt_cache: if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; @@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b537338f5c97..1e1fe6086dd9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -283,15 +283,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_default_ttl", - .data = &sysctl_ip_default_ttl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &ip_ttl_min, - .extra2 = &ip_ttl_max, - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -306,20 +297,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_early_demux", - .data = &sysctl_ip_early_demux, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ip_dynaddr", - .data = &sysctl_ip_dynaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -753,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_dynaddr", + .data = &init_net.ipv4.sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_early_demux", + .data = &init_net.ipv4.sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = &init_net.ipv4.sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_dynaddr = 0; + net->ipv4.sysctl_ip_early_demux = 1; + return 0; err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 014f18e2f7b3..a265f00b9df9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: @@ -938,7 +925,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1211,7 +1198,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { + if (i == sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } @@ -2642,6 +2629,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; u64 rate64; u32 rate; @@ -2722,6 +2710,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2954,7 +2947,7 @@ static void __tcp_alloc_md5sig_pool(void) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) + if (IS_ERR(hash)) return; per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5ee6fe0d152d..e6e65f79ade8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2907,7 +2907,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + struct rtt_meas rttm = { + .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), + .ts = now, + }; u32 elapsed; /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ba5d0146e3f0..4c8d58dfac9b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -311,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ -void tcp_req_err(struct sock *sk, u32 seq) +void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); @@ -323,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { + } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly @@ -383,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, + type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -1592,28 +1597,30 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard_and_relse; - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + reqsk_put(req); + goto discard_it; + } + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -2399,12 +2406,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; - net->ipv4.sysctl_igmp_max_memberships = 20; - net->ipv4.sysctl_igmp_max_msf = 10; - /* IGMP reports for link-local multicast groups are enabled by default */ - net->ipv4.sysctl_igmp_llm_reports = 1; - net->ipv4.sysctl_igmp_qrv = 2; - return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c26241f3057b..7b7eec439906 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -551,7 +551,7 @@ reset: */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt /= 8 * USEC_PER_MSEC; + crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fadd8b978951..ae90e4b34bd3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -452,7 +452,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 0; + newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -812,6 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index ebf5ff57526e..f6c50af24a64 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p = tcp_probe.log + tcp_probe.tail; - struct timespec tv - = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + struct timespec64 ts + = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, + (unsigned long)ts.tv_sec, + (unsigned long)ts.tv_nsec, &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9fc4e9c06aae..836abe58a9c5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1036,8 +1036,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, sk->sk_family == AF_INET6); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 56c4c8b88b28..f5abb1ae1358 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -33,8 +33,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool remcsum, need_csum, offload_csum, ufo; struct sk_buff *segs = ERR_PTR(-EINVAL); - bool remcsum, need_csum, offload_csum; struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; @@ -62,6 +62,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + /* Try to offload checksum if possible */ offload_csum = !!(need_csum && (skb->dev->features & @@ -74,9 +76,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * outer one so strip the existing checksum feature flags and * instead set the flag based on our outer checksum offload value. */ - if (remcsum) { + if (remcsum || ufo) { features &= ~NETIF_F_CSUM_MASK; - if (offload_csum) + if (!need_csum || offload_csum) features |= NETIF_F_HW_CSUM; } @@ -230,6 +232,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0ec08814f37d..96599d1a1318 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb uh->source = src_port; uh->len = htons(skb->len); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 40c897515ddc..11e875ffd7ac 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL + select DST_CACHE ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac0ba9e4e06b..27aed1afcf81 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; /* Check if a valid qdisc is available */ @@ -471,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; - /* type -1 is used for ALL */ - if (type == -1 || type == NETCONFA_FORWARDING) + if (type == NETCONFA_ALL) + all = true; + + if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); #ifdef CONFIG_IPV6_MROUTE - if (type == -1 || type == NETCONFA_MC_FORWARDING) + if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); #endif - if (type == -1 || type == NETCONFA_PROXY_NEIGH) + if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; @@ -495,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, { struct nlmsghdr *nlh; struct netconfmsg *ncm; + bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; + if (type == NETCONFA_ALL) + all = true; + ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET6; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; - /* type -1 is used for ALL */ - if ((type == -1 || type == NETCONFA_FORWARDING) && + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) goto nla_put_failure; #ifdef CONFIG_IPV6_MROUTE - if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, devconf->mc_forwarding) < 0) goto nla_put_failure; #endif - if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, devconf->ignore_routes_with_linkdown) < 0) goto nla_put_failure; @@ -583,7 +591,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -607,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); if (!skb) goto errout; err = inet6_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, - -1); + NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -658,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) { + NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } @@ -674,7 +682,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -685,7 +693,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -3168,6 +3176,81 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) +/* If the host route is cached on the addr struct make sure it is associated + * with the proper table. e.g., enslavement can change and if so the cached + * host route needs to move to the new table. + */ +static void l3mdev_check_host_rt(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + if (ifp->rt) { + u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; + + if (tb_id != ifp->rt->rt6i_table->tb6_id) { + ip6_del_rt(ifp->rt); + ifp->rt = NULL; + } + } +} +#else +static void l3mdev_check_host_rt(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ +} +#endif + +static int fixup_permanent_addr(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + l3mdev_check_host_rt(idev, ifp); + + if (!ifp->rt) { + struct rt6_info *rt; + + rt = addrconf_dst_alloc(idev, &ifp->addr, false); + if (unlikely(IS_ERR(rt))) + return PTR_ERR(rt); + + ifp->rt = rt; + } + + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + idev->dev, 0, 0); + } + + addrconf_dad_start(ifp); + + return 0; +} + +static void addrconf_permanent_addr(struct net_device *dev) +{ + struct inet6_ifaddr *ifp, *tmp; + struct inet6_dev *idev; + + idev = __in6_dev_get(dev); + if (!idev) + return; + + write_lock_bh(&idev->lock); + + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + if ((ifp->flags & IFA_F_PERMANENT) && + fixup_permanent_addr(idev, ifp) < 0) { + write_unlock_bh(&idev->lock); + ipv6_del_addr(ifp); + write_lock_bh(&idev->lock); + + net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", + idev->dev->name, &ifp->addr); + } + } + + write_unlock_bh(&idev->lock); +} + static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -3253,6 +3336,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + switch (dev->type) { #if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: @@ -3356,7 +3442,10 @@ static int addrconf_ifdown(struct net_device *dev, int how) { struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + struct inet6_ifaddr *ifa, *tmp; + struct list_head del_list; + int _keep_addr; + bool keep_addr; int state, i; ASSERT_RTNL(); @@ -3383,6 +3472,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) } + /* aggregate the system setting and interface setting */ + _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + if (!_keep_addr) + _keep_addr = idev->cnf.keep_addr_on_down; + + /* combine the user config with event to determine if permanent + * addresses are to be removed from address hash table + */ + keep_addr = !(how || _keep_addr <= 0); + /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; @@ -3391,9 +3490,15 @@ static int addrconf_ifdown(struct net_device *dev, int how) restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_dad_work(ifa); - goto restart; + /* combined flag + permanent flag decide if + * address is retained on a down event + */ + if (!keep_addr || + !(ifa->flags & IFA_F_PERMANENT)) { + hlist_del_init_rcu(&ifa->addr_lst); + goto restart; + } } } spin_unlock_bh(&addrconf_hash_lock); @@ -3427,31 +3532,53 @@ restart: write_lock_bh(&idev->lock); } - while (!list_empty(&idev->addr_list)) { - ifa = list_first_entry(&idev->addr_list, - struct inet6_ifaddr, if_list); - addrconf_del_dad_work(ifa); + /* re-combine the user config with event to determine if permanent + * addresses are to be removed from the interface list + */ + keep_addr = (!how && _keep_addr > 0); - list_del(&ifa->if_list); + INIT_LIST_HEAD(&del_list); + list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + addrconf_del_dad_work(ifa); write_unlock_bh(&idev->lock); - spin_lock_bh(&ifa->lock); - state = ifa->state; - ifa->state = INET6_IFADDR_STATE_DEAD; + + if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) { + /* set state to skip the notifier below */ + state = INET6_IFADDR_STATE_DEAD; + ifa->state = 0; + if (!(ifa->flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; + } else { + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + + list_del(&ifa->if_list); + list_add(&ifa->if_list, &del_list); + } + spin_unlock_bh(&ifa->lock); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } - in6_ifa_put(ifa); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); + /* now clean up addresses to be removed */ + while (!list_empty(&del_list)) { + ifa = list_first_entry(&del_list, + struct inet6_ifaddr, if_list); + list_del(&ifa->if_list); + + in6_ifa_put(ifa); + } + /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -3538,6 +3665,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; + bool notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3583,7 +3711,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ - ipv6_ifa_notify(RTM_NEWADDR, ifp); + notify = true; } } @@ -3591,6 +3719,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) out: spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); + if (notify) + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) @@ -4713,6 +4843,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; + array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; } static inline size_t inet6_ifla6_size(void) @@ -5801,6 +5932,14 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec, }, { + .procname = "keep_addr_on_down", + .data = &ipv6_devconf.keep_addr_on_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, + { /* sentinel */ } }, diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 5c5d23e59da5..9508a20fbf61 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, *fragoff = _frag_off; return hp->nexthdr; } - return -ENOENT; + if (!found) + return -ENOENT; + if (fragoff) + *fragoff = _frag_off; + break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 32dc9aab7297..30613050e4ca 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -99,5 +99,6 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); +MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0c7e276c230e..ea071fad67a0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -55,8 +55,6 @@ struct fib6_cleaner { void *arg; }; -static DEFINE_RWLOCK(fib6_walker_lock); - #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else @@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker *w); +static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* @@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w); static void fib6_gc_timer_cb(unsigned long arg); -static LIST_HEAD(fib6_walkers); -#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) +#define FOR_WALKERS(net, w) \ + list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) -static void fib6_walker_link(struct fib6_walker *w) +static void fib6_walker_link(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); - list_add(&w->lh, &fib6_walkers); - write_unlock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); + list_add(&w->lh, &net->ipv6.fib6_walkers); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } -static void fib6_walker_unlink(struct fib6_walker *w) +static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); - write_unlock_bh(&fib6_walker_lock); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) @@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w) static void fib6_dump_end(struct netlink_callback *cb) { + struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); @@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; @@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->skip = 0; read_lock_bh(&table->tb6_lock); - res = fib6_walk(w); + res = fib6_walk(net, w); read_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; @@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); if (res <= 0) { - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); cb->args[4] = 0; } } @@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } #endif - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (!child) { if (w->root == fn) { w->root = w->node = NULL; @@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); node_free(fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) @@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, } /* Adjust walkers */ - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rt->dst.rt6_next; @@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, w->state = FWS_U; } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); rt->dst.rt6_next = NULL; @@ -1588,17 +1588,17 @@ skip: } } -static int fib6_walk(struct fib6_walker *w) +static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; - fib6_walker_link(w); + fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); return res; } @@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.arg = arg; c.net = net; - fib6_walk(&c.w); + fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, @@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static struct fib6_gc_args +struct fib6_gc_args { int timeout; int more; -} gc_args; +}; static int fib6_age(struct rt6_info *rt, void *arg) { + struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg) RT6_TRACE("expiring %p\n", rt); return -1; } - gc_args.more++; + gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { if (atomic_read(&rt->dst.__refcnt) == 0 && - time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if (rt->rt6i_flags & RTF_GATEWAY) { @@ -1769,21 +1770,20 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } } - gc_args.more++; + gc_args->more++; } return 0; } -static DEFINE_SPINLOCK(fib6_gc_lock); - void fib6_run_gc(unsigned long expires, struct net *net, bool force) { + struct fib6_gc_args gc_args; unsigned long now; if (force) { - spin_lock_bh(&fib6_gc_lock); - } else if (!spin_trylock_bh(&fib6_gc_lock)) { + spin_lock_bh(&net->ipv6.fib6_gc_lock); + } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } @@ -1792,7 +1792,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) gc_args.more = icmp6_dst_gc(); - fib6_clean_all(net, fib6_age, NULL); + fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -1802,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); - spin_unlock_bh(&fib6_gc_lock); + spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(unsigned long arg) @@ -1814,6 +1814,9 @@ static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + spin_lock_init(&net->ipv6.fib6_gc_lock); + rwlock_init(&net->ipv6.fib6_walker_lock); + INIT_LIST_HEAD(&net->ipv6.fib6_walkers); setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); @@ -1974,7 +1977,8 @@ static int ipv6_route_yield(struct fib6_walker *w) return 0; } -static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, + struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; @@ -1984,7 +1988,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) iter->w.args = iter; iter->sernum = iter->w.root->fn_sernum; INIT_LIST_HEAD(&iter->w.lh); - fib6_walker_link(&iter->w); + fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, @@ -2045,16 +2049,16 @@ iter_table: ++*pos; return iter->w.leaf; } else if (r < 0) { - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); return NULL; } - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); goto iter_table; } @@ -2069,7 +2073,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, pos); } else { return NULL; @@ -2085,10 +2089,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) static void ipv6_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { + struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); rcu_read_unlock_bh(); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1f9ebe3cbb4a..dc2db4f7b182 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp)) != NULL; + (sfl = rcu_dereference_protected(*sflp, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = rcu_dereference(sfl->next); + *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f37f18b6b40c..4e636e60a360 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(tunnel); + dst = dst_cache_get(&tunnel->dst_cache); if (!dst) { dst = ip6_route_output(net, NULL, fl6); @@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(tunnel, ndst); + dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); proto = NEXTHDR_GRE; @@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) __u32 mtu; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; @@ -1009,7 +1011,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1219,7 +1221,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1257,7 +1259,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(tunnel); + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; @@ -1512,6 +1514,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->destructor = ip6gre_dev_free; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 31ac3c56da4b..c05c425c2389 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a163102f1803..9428345d3a07 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -332,7 +332,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb_sender_cpu_clear(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..eb2ac4bb09ce 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - write_seqlock_bh(&idst->lock); - dst_release(rcu_dereference_protected( - idst->dst, - lockdep_is_held(&idst->lock.lock))); - if (dst) { - dst_hold(dst); - idst->cookie = rt6_get_cookie((struct rt6_info *)dst); - } else { - idst->cookie = 0; - } - rcu_assign_pointer(idst->dst, dst); - write_sequnlock_bh(&idst->lock); -} - -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) -{ - struct ip6_tnl_dst *idst; - struct dst_entry *dst; - unsigned int seq; - u32 cookie; - - idst = raw_cpu_ptr(t->dst_cache); - - rcu_read_lock(); - do { - seq = read_seqbegin(&idst->lock); - dst = rcu_dereference(idst->dst); - cookie = idst->cookie; - } while (read_seqretry(&idst->lock, seq)); - - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - - if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { - ip6_tnl_per_cpu_dst_set(idst, NULL); - dst_release(dst); - dst = NULL; - } - return dst; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); - -void ip6_tnl_dst_reset(struct ip6_tnl *t) -{ - int i; - - for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); - -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) -{ - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); - -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); - -void ip6_tnl_dst_destroy(struct ip6_tnl *t) -{ - if (!t->dst_cache) - return; - - ip6_tnl_dst_reset(t); - free_percpu(t->dst_cache); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); - -int ip6_tnl_dst_init(struct ip6_tnl *t) -{ - int i; - - t->dst_cache = alloc_percpu(struct ip6_tnl_dst); - if (!t->dst_cache) - return -ENOMEM; - - for_each_possible_cpu(i) - seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); - /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(t); + dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(t, ndst); + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); skb->transport_header = skb->network_header; @@ -1180,6 +1089,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + tproto = ACCESS_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; @@ -1366,7 +1277,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; } @@ -1637,7 +1548,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(t); + ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 14dacf1df529..a7520528ecd2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck) + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33d79..d90a11f14040 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ee56d0a8699..d64ee7e83664 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) return NULL; skb->priority = TC_PRIO_CONTROL; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf2819b..84f9baf7aee8 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -2071,9 +2071,28 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ip6t_register_table(struct net *net, - const struct xt_table *table, - const struct ip6t_replace *repl) +static void __ip6t_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ip6t_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ip6t_register_table(struct net *net, const struct xt_table *table, + const struct ip6t_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2082,10 +2101,8 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2099,30 +2116,28 @@ struct xt_table *ip6t_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ip6t_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ip6t_unregister_table(struct net *net, struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ip6t_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 8b277b983ca5..1343077dde93 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, + .table_init = ip6table_filter_table_init, }; /* The work comes in here from netfilter.c. */ @@ -44,9 +47,13 @@ static struct nf_hook_ops *filter_ops __read_mostly; static bool forward = true; module_param(forward, bool, 0000); -static int __net_init ip6table_filter_net_init(struct net *net) +static int __net_init ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; + int err; + + if (net->ipv6.ip6table_filter) + return 0; repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -55,15 +62,26 @@ static int __net_init ip6table_filter_net_init(struct net *net) ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv6.ip6table_filter = - ip6t_register_table(net, &packet_filter, repl); + err = ip6t_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv6.ip6table_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter); + return err; +} + +static int __net_init ip6table_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return ip6table_filter_table_init(net); + + return 0; } static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_filter); + if (!net->ipv6.ip6table_filter) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops); + net->ipv6.ip6table_filter = NULL; } static struct pernet_operations ip6table_filter_net_ops = { @@ -75,28 +93,21 @@ static int __init ip6table_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - goto cleanup_table; - } + kfree(filter_ops); return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_filter_net_ops); - return ret; } static void __exit ip6table_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&ip6table_filter_net_ops); + kfree(filter_ops); } module_init(ip6table_filter_init); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index abe278b07932..cb2b28883252 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init ip6table_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, + .table_init = ip6table_mangle_table_init, }; static unsigned int @@ -88,26 +91,33 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init ip6table_mangle_net_init(struct net *net) +static int __net_init ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_mangle) + return 0; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_mangle = - ip6t_register_table(net, &packet_mangler, repl); + ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv6.ip6table_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle); + return ret; } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_mangle); + if (!net->ipv6.ip6table_mangle) + return; + + ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops); + net->ipv6.ip6table_mangle = NULL; } static struct pernet_operations ip6table_mangle_net_ops = { - .init = ip6table_mangle_net_init, .exit = ip6table_mangle_net_exit, }; @@ -115,28 +125,28 @@ static int __init ip6table_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); + ret = register_pernet_subsys(&ip6table_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; - - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_mangle_net_ops); + ret = ip6table_mangle_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); + } return ret; } static void __exit ip6table_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); } module_init(ip6table_mangle_init); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index de2a10a565f5..7d2bd940291f 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -20,6 +20,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init ip6table_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv6_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV6, + .table_init = ip6table_nat_table_init, }; static unsigned int ip6table_nat_do_chain(void *priv, @@ -97,50 +100,50 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, }; -static int __net_init ip6table_nat_net_init(struct net *net) +static int __net_init ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_nat) + return 0; repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl); + ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, + nf_nat_ipv6_ops, &net->ipv6.ip6table_nat); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat); + return ret; } static void __net_exit ip6table_nat_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_nat); + if (!net->ipv6.ip6table_nat) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops); + net->ipv6.ip6table_nat = NULL; } static struct pernet_operations ip6table_nat_net_ops = { - .init = ip6table_nat_net_init, .exit = ip6table_nat_net_exit, }; static int __init ip6table_nat_init(void) { - int err; + int ret = register_pernet_subsys(&ip6table_nat_net_ops); - err = register_pernet_subsys(&ip6table_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&ip6table_nat_net_ops); -err1: - return err; + ret = ip6table_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&ip6table_nat_net_ops); + return ret; } static void __exit ip6table_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); unregister_pernet_subsys(&ip6table_nat_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 9021963565c3..d4bc56443dc1 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -9,12 +9,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, + .table_init = ip6table_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -27,26 +30,32 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init ip6table_raw_net_init(struct net *net) +static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_raw) + return 0; repl = ip6t_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_raw = - ip6t_register_table(net, &packet_raw, repl); + ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv6.ip6table_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw); + return ret; } static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_raw); + if (!net->ipv6.ip6table_raw) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops); + net->ipv6.ip6table_raw = NULL; } static struct pernet_operations ip6table_raw_net_ops = { - .init = ip6table_raw_net_init, .exit = ip6table_raw_net_exit, }; @@ -54,28 +63,29 @@ static int __init ip6table_raw_init(void) { int ret; + /* Register hooks */ + rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&ip6table_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; - - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_raw_net_ops); + ret = ip6table_raw_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); + } return ret; } static void __exit ip6table_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); } module_init(ip6table_raw_init); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 0d856fedfeb0..cf26ccb04056 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init ip6table_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, + .table_init = ip6table_security_table_init, }; static unsigned int @@ -44,26 +47,32 @@ ip6table_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init ip6table_security_net_init(struct net *net) +static int __net_init ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_security) + return 0; repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_security = - ip6t_register_table(net, &security_table, repl); + ret = ip6t_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv6.ip6table_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security); + return ret; } static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_security); + if (!net->ipv6.ip6table_security) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops); + net->ipv6.ip6table_security = NULL; } static struct pernet_operations ip6table_security_net_ops = { - .init = ip6table_security_net_init, .exit = ip6table_security_net_exit, }; @@ -71,27 +80,28 @@ static int __init ip6table_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&ip6table_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; - -cleanup_table: - unregister_pernet_subsys(&ip6table_security_net_ops); + ret = ip6table_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); + } return ret; } static void __exit ip6table_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); } module_init(ip6table_security_init); diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 31ba7ca19757..051b6a6bfff6 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -21,6 +21,10 @@ #include <net/ipv6.h> #include <net/netfilter/ipv6/nf_nat_masquerade.h> +#define MAX_WORK_COUNT 16 + +static atomic_t v6_worker_count; + unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, const struct net_device *out) @@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; +struct masq_dev_work { + struct work_struct work; + struct net *net; + int ifindex; +}; + +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + long index; + + w = container_of(work, struct masq_dev_work, work); + + index = w->ifindex; + nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&v6_worker_count); + module_put(THIS_MODULE); +} + +/* ipv6 inet notifier is an atomic notifier, i.e. we cannot + * schedule. + * + * Unfortunately, nf_ct_iterate_cleanup can run for a long + * time if there are lots of conntracks and the system + * handles high softirq load, so it frequently calls cond_resched + * while iterating the conntrack table. + * + * So we defer nf_ct_iterate_cleanup walk to the system workqueue. + * + * As we can have 'a lot' of inet_events (depending on amount + * of ipv6 addresses being deleted), we also need to add an upper + * limit to the number of queued work items. + */ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; + const struct net_device *dev; + struct masq_dev_work *w; + struct net *net; + + if (event != NETDEV_DOWN || + atomic_read(&v6_worker_count) >= MAX_WORK_COUNT) + return NOTIFY_DONE; + + dev = ifa->idev->dev; + net = maybe_get_net(dev_net(dev)); + if (!net) + return NOTIFY_DONE; - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w) { + atomic_inc(&v6_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = dev->ifindex; + w->net = net; + schedule_work(&w->work); + + return NOTIFY_DONE; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); + return NOTIFY_DONE; } static struct notifier_block masq_inet_notifier = { diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index cd1ac1637a05..9597ffb74077 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 18f3498a6c80..e2ea31175ef9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + skb_postpush_rcsum(head, skb_network_header(head), + skb_network_header_len(head)); rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9a6b407f5840..f45b8ffc2840 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } @@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } @@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); break; @@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } return 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9977b6f19f2a..33f2820181f9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -327,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; + bool fatal; int err; sk = __inet6_lookup_established(net, &tcp_hashinfo, @@ -345,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } seq = ntohl(th->seq); + fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, fatal); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -400,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { @@ -1387,7 +1388,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); @@ -1395,24 +1396,24 @@ process: reqsk_put(req); goto discard_it; } - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); tcp_v6_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ac4e7e03dded..fd25e447a5fa 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -548,6 +548,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; + int harderr; int err; struct net *net = dev_net(skb->dev); @@ -559,26 +560,27 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } + harderr = icmpv6_err_convert(type, code, &err); + np = inet6_sk(sk); + if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); + if (np->pmtudisc != IPV6_PMTUDISC_DONT) + harderr = 1; } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; } - np = inet6_sk(sk); - - if (!icmpv6_err_convert(type, code, &err) && !np->recverr) - goto out; - - if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) - goto out; - - if (np->recverr) + if (!np->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + } sk->sk_err = err; sk->sk_error_report(sk); @@ -920,11 +922,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 7441e1e63893..2b0fbe6929e8 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -81,12 +81,18 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, csum = skb_checksum(skb, 0, skb->len, 0); uh->check = udp_v6_check(skb->len, &ipv6h->saddr, &ipv6h->daddr, csum); - if (uh->check == 0) uh->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Check if there is enough headroom to insert fragment header. */ tnl_hlen = skb_tnl_header_len(skb); if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig new file mode 100644 index 000000000000..5db94d940ecc --- /dev/null +++ b/net/kcm/Kconfig @@ -0,0 +1,10 @@ + +config AF_KCM + tristate "KCM sockets" + depends on INET + select BPF_SYSCALL + ---help--- + KCM (Kernel Connection Multiplexor) sockets provide a method + for multiplexing messages of a message based application + protocol over kernel connectons (e.g. TCP connections). + diff --git a/net/kcm/Makefile b/net/kcm/Makefile new file mode 100644 index 000000000000..71256133e677 --- /dev/null +++ b/net/kcm/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_AF_KCM) += kcm.o + +kcm-y := kcmsock.o kcmproc.o diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c new file mode 100644 index 000000000000..738008726cc6 --- /dev/null +++ b/net/kcm/kcmproc.c @@ -0,0 +1,426 @@ +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/proc_fs.h> +#include <linux/rculist.h> +#include <linux/seq_file.h> +#include <linux/socket.h> +#include <net/inet_sock.h> +#include <net/kcm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/tcp.h> + +#ifdef CONFIG_PROC_FS +struct kcm_seq_muxinfo { + char *name; + const struct file_operations *seq_fops; + const struct seq_operations seq_ops; +}; + +static struct kcm_mux *kcm_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + return list_first_or_null_rcu(&knet->mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + + return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct kcm_mux *m; + + list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { + if (!pos) + return m; + --pos; + } + return NULL; +} + +static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *p; + + if (v == SEQ_START_TOKEN) + p = kcm_get_first(seq); + else + p = kcm_get_next(v); + ++*pos; + return p; +} + +static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + + if (!*pos) + return SEQ_START_TOKEN; + else + return kcm_get_idx(seq, *pos - 1); +} + +static void kcm_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +struct kcm_proc_mux_state { + struct seq_net_private p; + int idx; +}; + +static int kcm_seq_open(struct inode *inode, struct file *file) +{ + struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode); + int err; + + err = seq_open_net(inode, file, &muxinfo->seq_ops, + sizeof(struct kcm_proc_mux_state)); + if (err < 0) + return err; + return err; +} + +static void kcm_format_mux_header(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + seq_printf(seq, + "*** KCM statistics (%d MUX) ****\n", + knet->count); + + seq_printf(seq, + "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", + "Object", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Recv-Q", + "Rmem", + "Send-Q", + "Smem", + "Status"); + + /* XXX: pdsts header stuff here */ + seq_puts(seq, "\n"); +} + +static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", + kcm->index, + kcm->stats.rx_msgs, + kcm->stats.rx_bytes, + kcm->stats.tx_msgs, + kcm->stats.tx_bytes, + kcm->sk.sk_receive_queue.qlen, + sk_rmem_alloc_get(&kcm->sk), + kcm->sk.sk_write_queue.qlen, + "-"); + + if (kcm->tx_psock) + seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); + + if (kcm->tx_wait) + seq_puts(seq, "TxWait "); + + if (kcm->tx_wait_more) + seq_puts(seq, "WMore "); + + if (kcm->rx_wait) + seq_puts(seq, "RxWait "); + + seq_puts(seq, "\n"); +} + +static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", + psock->index, + psock->stats.rx_msgs, + psock->stats.rx_bytes, + psock->stats.tx_msgs, + psock->stats.tx_bytes, + psock->sk->sk_receive_queue.qlen, + atomic_read(&psock->sk->sk_rmem_alloc), + psock->sk->sk_write_queue.qlen, + atomic_read(&psock->sk->sk_wmem_alloc)); + + if (psock->done) + seq_puts(seq, "Done "); + + if (psock->tx_stopped) + seq_puts(seq, "TxStop "); + + if (psock->rx_stopped) + seq_puts(seq, "RxStop "); + + if (psock->tx_kcm) + seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); + + if (psock->ready_rx_msg) + seq_puts(seq, "RdyRx "); + + seq_puts(seq, "\n"); +} + +static void +kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) +{ + int i, len; + struct kcm_sock *kcm; + struct kcm_psock *psock; + + /* mux information */ + seq_printf(seq, + "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", + "mux", "", + mux->stats.rx_msgs, + mux->stats.rx_bytes, + mux->stats.tx_msgs, + mux->stats.tx_bytes, + "-", "-", "-", "-"); + + seq_printf(seq, "KCMs: %d, Psocks %d\n", + mux->kcm_socks_cnt, mux->psocks_cnt); + + /* kcm sock information */ + i = 0; + spin_lock_bh(&mux->lock); + list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { + kcm_format_sock(kcm, seq, i, &len); + i++; + } + i = 0; + list_for_each_entry(psock, &mux->psocks, psock_list) { + kcm_format_psock(psock, seq, i, &len); + i++; + } + spin_unlock_bh(&mux->lock); +} + +static int kcm_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_proc_mux_state *mux_state; + + mux_state = seq->private; + if (v == SEQ_START_TOKEN) { + mux_state->idx = 0; + kcm_format_mux_header(seq); + } else { + kcm_format_mux(v, mux_state->idx, seq); + mux_state->idx++; + } + return 0; +} + +static const struct file_operations kcm_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_seq_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static struct kcm_seq_muxinfo kcm_seq_muxinfo = { + .name = "kcm", + .seq_fops = &kcm_seq_fops, + .seq_ops = { + .show = kcm_seq_show, + .start = kcm_seq_start, + .next = kcm_seq_next, + .stop = kcm_seq_stop, + } +}; + +static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net, + muxinfo->seq_fops, muxinfo); + if (!p) + rc = -ENOMEM; + return rc; +} +EXPORT_SYMBOL(kcm_proc_register); + +static void kcm_proc_unregister(struct net *net, + struct kcm_seq_muxinfo *muxinfo) +{ + remove_proc_entry(muxinfo->name, net->proc_net); +} +EXPORT_SYMBOL(kcm_proc_unregister); + +static int kcm_stats_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_psock_stats psock_stats; + struct kcm_mux_stats mux_stats; + struct kcm_mux *mux; + struct kcm_psock *psock; + struct net *net = seq->private; + struct kcm_net *knet = net_generic(net, kcm_net_id); + + memset(&mux_stats, 0, sizeof(mux_stats)); + memset(&psock_stats, 0, sizeof(psock_stats)); + + mutex_lock(&knet->mutex); + + aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); + aggregate_psock_stats(&knet->aggregate_psock_stats, + &psock_stats); + + list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { + spin_lock_bh(&mux->lock); + aggregate_mux_stats(&mux->stats, &mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &psock_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_psock_stats(&psock->stats, &psock_stats); + spin_unlock_bh(&mux->lock); + } + + mutex_unlock(&knet->mutex); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", + "MUX", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "TX-Retries", + "Attach", + "Unattach", + "UnattchRsvd", + "RX-RdyDrops"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", + "", + mux_stats.rx_msgs, + mux_stats.rx_bytes, + mux_stats.tx_msgs, + mux_stats.tx_bytes, + mux_stats.tx_retries, + mux_stats.psock_attach, + mux_stats.psock_unattach_rsvd, + mux_stats.psock_unattach, + mux_stats.rx_ready_drops); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "Psock", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Reserved", + "Unreserved", + "RX-Aborts", + "RX-MemFail", + "RX-NeedMor", + "RX-BadLen", + "RX-TooBig", + "RX-Timeout", + "TX-Aborts"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", + "", + psock_stats.rx_msgs, + psock_stats.rx_bytes, + psock_stats.tx_msgs, + psock_stats.tx_bytes, + psock_stats.reserved, + psock_stats.unreserved, + psock_stats.rx_aborts, + psock_stats.rx_mem_fail, + psock_stats.rx_need_more_hdr, + psock_stats.rx_bad_hdr_len, + psock_stats.rx_msg_too_big, + psock_stats.rx_msg_timeouts, + psock_stats.tx_aborts); + + return 0; +} + +static int kcm_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, kcm_stats_seq_show); +} + +static const struct file_operations kcm_stats_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int kcm_proc_init_net(struct net *net) +{ + int err; + + if (!proc_create("kcm_stats", S_IRUGO, net->proc_net, + &kcm_stats_seq_fops)) { + err = -ENOMEM; + goto out_kcm_stats; + } + + err = kcm_proc_register(net, &kcm_seq_muxinfo); + if (err) + goto out_kcm; + + return 0; + +out_kcm: + remove_proc_entry("kcm_stats", net->proc_net); +out_kcm_stats: + return err; +} + +static void kcm_proc_exit_net(struct net *net) +{ + kcm_proc_unregister(net, &kcm_seq_muxinfo); + remove_proc_entry("kcm_stats", net->proc_net); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_proc_init_net, + .exit = kcm_proc_exit_net, +}; + +int __init kcm_proc_init(void) +{ + return register_pernet_subsys(&kcm_net_ops); +} + +void __exit kcm_proc_exit(void) +{ + unregister_pernet_subsys(&kcm_net_ops); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c new file mode 100644 index 000000000000..40662d73204f --- /dev/null +++ b/net/kcm/kcmsock.c @@ -0,0 +1,2409 @@ +#include <linux/bpf.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <net/kcm.h> +#include <net/netns/generic.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <uapi/linux/kcm.h> + +unsigned int kcm_net_id; + +static struct kmem_cache *kcm_psockp __read_mostly; +static struct kmem_cache *kcm_muxp __read_mostly; +static struct workqueue_struct *kcm_wq; + +static inline struct kcm_sock *kcm_sk(const struct sock *sk) +{ + return (struct kcm_sock *)sk; +} + +static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) +{ + return (struct kcm_tx_msg *)skb->cb; +} + +static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) +{ + return (struct kcm_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +static void report_csk_error(struct sock *csk, int err) +{ + csk->sk_err = EPIPE; + csk->sk_error_report(csk); +} + +/* Callback lock held */ +static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, + struct sk_buff *skb) +{ + struct sock *csk = psock->sk; + + /* Unrecoverable error in receive */ + + del_timer(&psock->rx_msg_timer); + + if (psock->rx_stopped) + return; + + psock->rx_stopped = 1; + KCM_STATS_INCR(psock->stats.rx_aborts); + + /* Report an error on the lower socket */ + report_csk_error(csk, err); +} + +static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, + bool wakeup_kcm) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Unrecoverable error in transmit */ + + spin_lock_bh(&mux->lock); + + if (psock->tx_stopped) { + spin_unlock_bh(&mux->lock); + return; + } + + psock->tx_stopped = 1; + KCM_STATS_INCR(psock->stats.tx_aborts); + + if (!psock->tx_kcm) { + /* Take off psocks_avail list */ + list_del(&psock->psock_avail_list); + } else if (wakeup_kcm) { + /* In this case psock is being aborted while outside of + * write_msgs and psock is reserved. Schedule tx_work + * to handle the failure there. Need to commit tx_stopped + * before queuing work. + */ + smp_mb(); + + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + } + + spin_unlock_bh(&mux->lock); + + /* Report error on lower socket */ + report_csk_error(csk, err); +} + +/* RX mux lock held. */ +static void kcm_update_rx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.rx_bytes, + psock->stats.rx_bytes - psock->saved_rx_bytes); + mux->stats.rx_msgs += + psock->stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->stats.rx_msgs; + psock->saved_rx_bytes = psock->stats.rx_bytes; +} + +static void kcm_update_tx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.tx_bytes, + psock->stats.tx_bytes - psock->saved_tx_bytes); + mux->stats.tx_msgs += + psock->stats.tx_msgs - psock->saved_tx_msgs; + psock->saved_tx_msgs = psock->stats.tx_msgs; + psock->saved_tx_bytes = psock->stats.tx_bytes; +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +/* KCM is ready to receive messages on its queue-- either the KCM is new or + * has become unblocked after being blocked on full socket buffer. Queue any + * pending ready messages on a psock. RX mux lock held. + */ +static void kcm_rcv_ready(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct sk_buff *skb; + + if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) + return; + + while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Assuming buffer limit has been reached */ + skb_queue_head(&mux->rx_hold_queue, skb); + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + } + + while (!list_empty(&mux->psocks_ready)) { + psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, + psock_ready_list); + + if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { + /* Assuming buffer limit has been reached */ + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + + /* Consumed the ready message on the psock. Schedule rx_work to + * get more messages. + */ + list_del(&psock->psock_ready_list); + psock->ready_rx_msg = NULL; + + /* Commit clearing of ready_rx_msg for queuing work */ + smp_mb(); + + queue_work(kcm_wq, &psock->rx_work); + } + + /* Buffer limit is okay now, add to ready list */ + list_add_tail(&kcm->wait_rx_list, + &kcm->mux->kcm_rx_waiters); + kcm->rx_wait = true; +} + +static void kcm_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct kcm_mux *mux = kcm->mux; + unsigned int len = skb->truesize; + + sk_mem_uncharge(sk, len); + atomic_sub(len, &sk->sk_rmem_alloc); + + /* For reading rx_wait and rx_psock without holding lock */ + smp_mb__after_atomic(); + + if (!kcm->rx_wait && !kcm->rx_psock && + sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); + } +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + return -ENOMEM; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return -ENOBUFS; + + skb->dev = NULL; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = kcm_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); + + skb_queue_tail(list, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + + return 0; +} + +/* Requeue received messages for a kcm socket to other kcm sockets. This is + * called with a kcm socket is receive disabled. + * RX mux lock held. + */ +static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) +{ + struct sk_buff *skb; + struct kcm_sock *kcm; + + while ((skb = __skb_dequeue(head))) { + /* Reset destructor to avoid calling kcm_rcv_ready */ + skb->destructor = sock_rfree; + skb_orphan(skb); +try_again: + if (list_empty(&mux->kcm_rx_waiters)) { + skb_queue_tail(&mux->rx_hold_queue, skb); + continue; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + /* Commit rx_wait to read in kcm_free */ + smp_wmb(); + + goto try_again; + } + } +} + +/* Lower sock lock held */ +static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, + struct sk_buff *head) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + WARN_ON(psock->ready_rx_msg); + + if (psock->rx_kcm) + return psock->rx_kcm; + + spin_lock_bh(&mux->rx_lock); + + if (psock->rx_kcm) { + spin_unlock_bh(&mux->rx_lock); + return psock->rx_kcm; + } + + kcm_update_rx_mux_stats(mux, psock); + + if (list_empty(&mux->kcm_rx_waiters)) { + psock->ready_rx_msg = head; + list_add_tail(&psock->psock_ready_list, + &mux->psocks_ready); + spin_unlock_bh(&mux->rx_lock); + return NULL; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + psock->rx_kcm = kcm; + kcm->rx_psock = psock; + + spin_unlock_bh(&mux->rx_lock); + + return kcm; +} + +static void kcm_done(struct kcm_sock *kcm); + +static void kcm_done_work(struct work_struct *w) +{ + kcm_done(container_of(w, struct kcm_sock, done_work)); +} + +/* Lower sock held */ +static void unreserve_rx_kcm(struct kcm_psock *psock, + bool rcv_ready) +{ + struct kcm_sock *kcm = psock->rx_kcm; + struct kcm_mux *mux = psock->mux; + + if (!kcm) + return; + + spin_lock_bh(&mux->rx_lock); + + psock->rx_kcm = NULL; + kcm->rx_psock = NULL; + + /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with + * kcm_rfree + */ + smp_mb(); + + if (unlikely(kcm->done)) { + spin_unlock_bh(&mux->rx_lock); + + /* Need to run kcm_done in a task since we need to qcquire + * callback locks which may already be held here. + */ + INIT_WORK(&kcm->done_work, kcm_done_work); + schedule_work(&kcm->done_work); + return; + } + + if (unlikely(kcm->rx_disabled)) { + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { + /* Check for degenerative race with rx_wait that all + * data was dequeued (accounted for in kcm_rfree). + */ + kcm_rcv_ready(kcm); + } + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_start_rx_timer(struct kcm_psock *psock) +{ + if (psock->sk->sk_rcvtimeo) + mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); +} + +/* Macro to invoke filter function. */ +#define KCM_RUN_FILTER(prog, ctx) \ + (*prog->bpf_func)(ctx, prog->insnsi) + +/* Lower socket lock held */ +static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; + struct kcm_rx_msg *rxm; + struct kcm_sock *kcm; + struct sk_buff *head, *skb; + size_t eaten = 0, cand_len; + ssize_t extra; + int err; + bool cloned_orig = false; + + if (psock->ready_rx_msg) + return 0; + + head = psock->rx_skb_head; + if (head) { + /* Message already in progress */ + + rxm = kcm_rx_msg(head); + if (unlikely(rxm->early_eaten)) { + /* Already some number of bytes on the receive sock + * data saved in rx_skb_head, just indicate they + * are consumed. + */ + eaten = orig_len <= rxm->early_eaten ? + orig_len : rxm->early_eaten; + rxm->early_eaten -= eaten; + + return eaten; + } + + if (unlikely(orig_offset)) { + /* Getting data with a non-zero offset when a message is + * in progress is not expected. If it does happen, we + * need to clone and pull since we can't deal with + * offsets in the skbs for a message expect in the head. + */ + orig_skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!orig_skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + if (!pskb_pull(orig_skb, orig_offset)) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + kfree_skb(orig_skb); + desc->error = -ENOMEM; + return 0; + } + cloned_orig = true; + orig_offset = 0; + } + + if (!psock->rx_skb_nextp) { + /* We are going to append to the frags_list of head. + * Need to unshare the frag_list. + */ + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + return 0; + } + + if (unlikely(skb_shinfo(head)->frag_list)) { + /* We can't append to an sk_buff that already + * has a frag_list. We create a new head, point + * the frag_list of that to the old head, and + * then are able to use the old head->next for + * appending to the message. + */ + if (WARN_ON(head->next)) { + desc->error = -EINVAL; + return 0; + } + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + skb->len = head->len; + skb->data_len = head->len; + skb->truesize = head->truesize; + *kcm_rx_msg(skb) = *kcm_rx_msg(head); + psock->rx_skb_nextp = &head->next; + skb_shinfo(skb)->frag_list = head; + psock->rx_skb_head = skb; + head = skb; + } else { + psock->rx_skb_nextp = + &skb_shinfo(head)->frag_list; + } + } + } + + while (eaten < orig_len) { + /* Always clone since we will consume something */ + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + break; + } + + cand_len = orig_len - eaten; + + head = psock->rx_skb_head; + if (!head) { + head = skb; + psock->rx_skb_head = head; + /* Will set rx_skb_nextp on next packet if needed */ + psock->rx_skb_nextp = NULL; + rxm = kcm_rx_msg(head); + memset(rxm, 0, sizeof(*rxm)); + rxm->offset = orig_offset + eaten; + } else { + /* Unclone since we may be appending to an skb that we + * already share a frag_list with. + */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + break; + } + + rxm = kcm_rx_msg(head); + *psock->rx_skb_nextp = skb; + psock->rx_skb_nextp = &skb->next; + head->data_len += skb->len; + head->len += skb->len; + head->truesize += skb->truesize; + } + + if (!rxm->full_len) { + ssize_t len; + + len = KCM_RUN_FILTER(psock->bpf_prog, head); + + if (!len) { + /* Need more header to determine length */ + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + rxm->accum_len += cand_len; + eaten += cand_len; + KCM_STATS_INCR(psock->stats.rx_need_more_hdr); + WARN_ON(eaten != orig_len); + break; + } else if (len > psock->sk->sk_rcvbuf) { + /* Message length exceeds maximum allowed */ + KCM_STATS_INCR(psock->stats.rx_msg_too_big); + desc->error = -EMSGSIZE; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EMSGSIZE, head); + break; + } else if (len <= (ssize_t)head->len - + skb->len - rxm->offset) { + /* Length must be into new skb (and also + * greater than zero) + */ + KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); + desc->error = -EPROTO; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EPROTO, head); + break; + } + + rxm->full_len = len; + } + + extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; + + if (extra < 0) { + /* Message not complete yet. */ + if (rxm->full_len - rxm->accum_len > + tcp_inq(psock->sk)) { + /* Don't have the whole messages in the socket + * buffer. Set psock->rx_need_bytes to wait for + * the rest of the message. Also, set "early + * eaten" since we've already buffered the skb + * but don't consume yet per tcp_read_sock. + */ + + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + + psock->rx_need_bytes = rxm->full_len - + rxm->accum_len; + rxm->accum_len += cand_len; + rxm->early_eaten = cand_len; + KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); + desc->count = 0; /* Stop reading socket */ + break; + } + rxm->accum_len += cand_len; + eaten += cand_len; + WARN_ON(eaten != orig_len); + break; + } + + /* Positive extra indicates ore bytes than needed for the + * message + */ + + WARN_ON(extra > cand_len); + + eaten += (cand_len - extra); + + /* Hurray, we have a new message! */ + del_timer(&psock->rx_msg_timer); + psock->rx_skb_head = NULL; + KCM_STATS_INCR(psock->stats.rx_msgs); + +try_queue: + kcm = reserve_rx_kcm(psock, head); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock. */ + break; + } + + if (kcm_queue_rcv_skb(&kcm->sk, head)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } + } + + if (cloned_orig) + kfree_skb(orig_skb); + + KCM_STATS_ADD(psock->stats.rx_bytes, eaten); + + return eaten; +} + +/* Called with lock held on lower socket */ +static int psock_tcp_read_sock(struct kcm_psock *psock) +{ + read_descriptor_t desc; + + desc.arg.data = psock; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + /* sk should be locked here, so okay to do tcp_read_sock */ + tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); + + unreserve_rx_kcm(psock, true); + + return desc.error; +} + +/* Lower sock lock held */ +static void psock_tcp_data_ready(struct sock *sk) +{ + struct kcm_psock *psock; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock || psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + if (psock->rx_need_bytes) { + if (tcp_inq(sk) >= psock->rx_need_bytes) + psock->rx_need_bytes = 0; + else + goto out; + } + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void do_psock_rx_work(struct kcm_psock *psock) +{ + read_descriptor_t rd_desc; + struct sock *csk = psock->sk; + + /* We need the read lock to synchronize with psock_tcp_data_ready. We + * need the socket lock for calling tcp_read_sock. + */ + lock_sock(csk); + read_lock_bh(&csk->sk_callback_lock); + + if (unlikely(csk->sk_user_data != psock)) + goto out; + + if (unlikely(psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + rd_desc.arg.data = psock; + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&csk->sk_callback_lock); + release_sock(csk); +} + +static void psock_rx_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); +} + +static void psock_rx_delayed_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, + rx_delayed_work.work)); +} + +static void psock_tcp_state_change(struct sock *sk) +{ + /* TCP only does a POLLIN for a half close. Do a POLLHUP here + * since application will normally not poll with POLLIN + * on the TCP sockets. + */ + + report_csk_error(sk, EPIPE); +} + +static void psock_tcp_write_space(struct sock *sk) +{ + struct kcm_psock *psock; + struct kcm_mux *mux; + struct kcm_sock *kcm; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock)) + goto out; + + mux = psock->mux; + + spin_lock_bh(&mux->lock); + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; + if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void unreserve_psock(struct kcm_sock *kcm); + +/* kcm sock is locked. */ +static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + + psock = kcm->tx_psock; + + smp_rmb(); /* Must read tx_psock before tx_wait */ + + if (psock) { + WARN_ON(kcm->tx_wait); + if (unlikely(psock->tx_stopped)) + unreserve_psock(kcm); + else + return kcm->tx_psock; + } + + spin_lock_bh(&mux->lock); + + /* Check again under lock to see if psock was reserved for this + * psock via psock_unreserve. + */ + psock = kcm->tx_psock; + if (unlikely(psock)) { + WARN_ON(kcm->tx_wait); + spin_unlock_bh(&mux->lock); + return kcm->tx_psock; + } + + if (!list_empty(&mux->psocks_avail)) { + psock = list_first_entry(&mux->psocks_avail, + struct kcm_psock, + psock_avail_list); + list_del(&psock->psock_avail_list); + if (kcm->tx_wait) { + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + kcm->tx_psock = psock; + psock->tx_kcm = kcm; + KCM_STATS_INCR(psock->stats.reserved); + } else if (!kcm->tx_wait) { + list_add_tail(&kcm->wait_psock_list, + &mux->kcm_tx_waiters); + kcm->tx_wait = true; + } + + spin_unlock_bh(&mux->lock); + + return psock; +} + +/* mux lock held */ +static void psock_now_avail(struct kcm_psock *psock) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + if (list_empty(&mux->kcm_tx_waiters)) { + list_add_tail(&psock->psock_avail_list, + &mux->psocks_avail); + } else { + kcm = list_first_entry(&mux->kcm_tx_waiters, + struct kcm_sock, + wait_psock_list); + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + psock->tx_kcm = kcm; + + /* Commit before changing tx_psock since that is read in + * reserve_psock before queuing work. + */ + smp_mb(); + + kcm->tx_psock = psock; + KCM_STATS_INCR(psock->stats.reserved); + queue_work(kcm_wq, &kcm->tx_work); + } +} + +/* kcm sock is locked. */ +static void unreserve_psock(struct kcm_sock *kcm) +{ + struct kcm_psock *psock; + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + + psock = kcm->tx_psock; + + if (WARN_ON(!psock)) { + spin_unlock_bh(&mux->lock); + return; + } + + smp_rmb(); /* Read tx_psock before tx_wait */ + + kcm_update_tx_mux_stats(mux, psock); + + WARN_ON(kcm->tx_wait); + + kcm->tx_psock = NULL; + psock->tx_kcm = NULL; + KCM_STATS_INCR(psock->stats.unreserved); + + if (unlikely(psock->tx_stopped)) { + if (psock->done) { + /* Deferred free */ + list_del(&psock->psock_list); + mux->psocks_cnt--; + sock_put(psock->sk); + fput(psock->sk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } + + /* Don't put back on available list */ + + spin_unlock_bh(&mux->lock); + + return; + } + + psock_now_avail(psock); + + spin_unlock_bh(&mux->lock); +} + +static void kcm_report_tx_retry(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + KCM_STATS_INCR(mux->stats.tx_retries); + spin_unlock_bh(&mux->lock); +} + +/* Write any messages ready on the kcm socket. Called with kcm sock lock + * held. Return bytes actually sent or error. + */ +static int kcm_write_msgs(struct kcm_sock *kcm) +{ + struct sock *sk = &kcm->sk; + struct kcm_psock *psock; + struct sk_buff *skb, *head; + struct kcm_tx_msg *txm; + unsigned short fragidx, frag_offset; + unsigned int sent, total_sent = 0; + int ret = 0; + + kcm->tx_wait_more = false; + psock = kcm->tx_psock; + if (unlikely(psock && psock->tx_stopped)) { + /* A reserved psock was aborted asynchronously. Unreserve + * it and we'll retry the message. + */ + unreserve_psock(kcm); + kcm_report_tx_retry(kcm); + if (skb_queue_empty(&sk->sk_write_queue)) + return 0; + + kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; + + } else if (skb_queue_empty(&sk->sk_write_queue)) { + return 0; + } + + head = skb_peek(&sk->sk_write_queue); + txm = kcm_tx_msg(head); + + if (txm->sent) { + /* Send of first skbuff in queue already in progress */ + if (WARN_ON(!psock)) { + ret = -EINVAL; + goto out; + } + sent = txm->sent; + frag_offset = txm->frag_offset; + fragidx = txm->fragidx; + skb = txm->frag_skb; + + goto do_frag; + } + +try_again: + psock = reserve_psock(kcm); + if (!psock) + goto out; + + do { + skb = head; + txm = kcm_tx_msg(head); + sent = 0; + +do_frag_list: + if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { + ret = -EINVAL; + goto out; + } + + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; + fragidx++) { + skb_frag_t *frag; + + frag_offset = 0; +do_frag: + frag = &skb_shinfo(skb)->frags[fragidx]; + if (WARN_ON(!frag->size)) { + ret = -EINVAL; + goto out; + } + + ret = kernel_sendpage(psock->sk->sk_socket, + frag->page.p, + frag->page_offset + frag_offset, + frag->size - frag_offset, + MSG_DONTWAIT); + if (ret <= 0) { + if (ret == -EAGAIN) { + /* Save state to try again when there's + * write space on the socket + */ + txm->sent = sent; + txm->frag_offset = frag_offset; + txm->fragidx = fragidx; + txm->frag_skb = skb; + + ret = 0; + goto out; + } + + /* Hard failure in sending message, abort this + * psock since it has lost framing + * synchonization and retry sending the + * message from the beginning. + */ + kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, + true); + unreserve_psock(kcm); + + txm->sent = 0; + kcm_report_tx_retry(kcm); + ret = 0; + + goto try_again; + } + + sent += ret; + frag_offset += ret; + KCM_STATS_ADD(psock->stats.tx_bytes, ret); + if (frag_offset < frag->size) { + /* Not finished with this frag */ + goto do_frag; + } + } + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + + /* Successfully sent the whole packet, account for it. */ + skb_dequeue(&sk->sk_write_queue); + kfree_skb(head); + sk->sk_wmem_queued -= sent; + total_sent += sent; + KCM_STATS_INCR(psock->stats.tx_msgs); + } while ((head = skb_peek(&sk->sk_write_queue))); +out: + if (!head) { + /* Done with all queued messages. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + unreserve_psock(kcm); + } + + /* Check if write space is available */ + sk->sk_write_space(sk); + + return total_sent ? : ret; +} + +static void kcm_tx_work(struct work_struct *w) +{ + struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); + struct sock *sk = &kcm->sk; + int err; + + lock_sock(sk); + + /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx + * aborts + */ + err = kcm_write_msgs(kcm); + if (err < 0) { + /* Hard failure in write, report error on KCM socket */ + pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); + report_csk_error(&kcm->sk, -err); + goto out; + } + + /* Primarily for SOCK_SEQPACKET sockets */ + if (likely(sk->sk_socket) && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_space(sk); + } + +out: + release_sock(sk); +} + +static void kcm_push(struct kcm_sock *kcm) +{ + if (kcm->tx_wait_more) + kcm_write_msgs(kcm); +} + +static ssize_t kcm_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) + +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + bool eor; + int err = 0; + int i; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + /* No MSG_EOR from splice, only look at MSG_MORE */ + eor = !(flags & MSG_MORE); + + lock_sock(sk); + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + err = -EPIPE; + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + goto coalesced; + } + + if (i >= MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + while (!tskb) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + i = 0; + } + } else { + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + skb = head; + i = 0; + } + + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + +coalesced: + skb->len += size; + skb->data_len += size; + skb->truesize += size; + sk->sk_wmem_queued += size; + sk_mem_charge(sk, size); + + if (head != skb) { + head->len += size; + head->data_len += size; + head->truesize += size; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, size); + + release_sock(sk); + return size; + +out_error: + kcm_push(kcm); + + err = sk_stream_error(sk, flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + size_t copy, copied = 0; + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + int eor = (sock->type == SOCK_DGRAM) ? + !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); + int err = -EPIPE; + + lock_sock(sk); + + /* Per tcp_sendmsg this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + goto start; + } + + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + /* New message, alloc head skb */ + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + + head = alloc_skb(0, sk->sk_allocation); + } + + skb = head; + + /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling + * csum_and_copy_from_iter from skb_do_copy_data_nocache. + */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + +start: + while (msg_data_left(msg)) { + bool merge = true; + int i = skb_shinfo(skb)->nr_frags; + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_memory; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i == MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + if (!tskb) + goto wait_for_memory; + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + continue; + } + merge = false; + } + + copy = min_t(int, msg_data_left(msg), + pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto out_error; + + /* Update the skb. */ + if (merge) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + + pfrag->offset += copy; + copied += copy; + if (head != skb) { + head->len += copy; + head->data_len += copy; + } + + continue; + +wait_for_memory: + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (msg->msg_flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ +partial_message: + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, copied); + + release_sock(sk); + return copied; + +out_error: + kcm_push(kcm); + + if (copied && sock->type == SOCK_SEQPACKET) { + /* Wrote some bytes before encountering an + * error, return partial success. + */ + goto partial_message; + } + + if (head != kcm->seq_skb) + kfree_skb(head); + + err = sk_stream_error(sk, msg->msg_flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct sk_buff *skb; + + while (!(skb = skb_peek(&sk->sk_receive_queue))) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + sk_wait_data(sk, &timeo, NULL); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + int err = 0; + long timeo; + struct kcm_rx_msg *rxm; + int copied = 0; + struct sk_buff *skb; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + if (err < 0) + goto out; + + copied = len; + if (likely(!(flags & MSG_PEEK))) { + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + if (copied < rxm->full_len) { + if (sock->type == SOCK_DGRAM) { + /* Truncated message */ + msg->msg_flags |= MSG_TRUNC; + goto msg_finished; + } + rxm->offset += copied; + rxm->full_len -= copied; + } else { +msg_finished: + /* Finished with message */ + msg->msg_flags |= MSG_EOR; + KCM_STATS_INCR(kcm->stats.rx_msgs); + skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); + } + } + +out: + release_sock(sk); + + return copied ? : err; +} + +static ssize_t kcm_sock_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + +static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + long timeo; + struct kcm_rx_msg *rxm; + int err = 0; + size_t copied; + struct sk_buff *skb; + + /* Only support splice for SOCKSEQPACKET */ + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto err_out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, + kcm_sock_splice); + if (copied < 0) { + err = copied; + goto err_out; + } + + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + + rxm->offset += copied; + rxm->full_len -= copied; + + /* We have no way to return MSG_EOR. If all the bytes have been + * read we still leave the message in the receive socket buffer. + * A subsequent recvmsg needs to be done to return MSG_EOR and + * finish reading the message. + */ + + release_sock(sk); + + return copied; + +err_out: + release_sock(sk); + + return err; +} + +/* kcm sock lock held */ +static void kcm_recv_disable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 1; + + /* If a psock is reserved we'll do cleanup in unreserve */ + if (!kcm->rx_psock) { + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } + + spin_unlock_bh(&mux->rx_lock); +} + +/* kcm sock lock held */ +static void kcm_recv_enable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (!kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 0; + kcm_rcv_ready(kcm); + + spin_unlock_bh(&mux->rx_lock); +} + +static int kcm_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, valbool; + int err = 0; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EINVAL; + + valbool = val ? 1 : 0; + + switch (optname) { + case KCM_RECV_DISABLE: + lock_sock(&kcm->sk); + if (valbool) + kcm_recv_disable(kcm); + else + kcm_recv_enable(kcm); + release_sock(&kcm->sk); + break; + default: + err = -ENOPROTOOPT; + } + + return err; +} + +static int kcm_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, len; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + switch (optname) { + case KCM_RECV_DISABLE: + val = kcm->rx_disabled; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) +{ + struct kcm_sock *tkcm; + struct list_head *head; + int index = 0; + + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * POLLHUP + */ + kcm->sk.sk_state = TCP_ESTABLISHED; + + /* Add to mux's kcm sockets list */ + kcm->mux = mux; + spin_lock_bh(&mux->lock); + + head = &mux->kcm_socks; + list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { + if (tkcm->index != index) + break; + head = &tkcm->kcm_sock_list; + index++; + } + + list_add(&kcm->kcm_sock_list, head); + kcm->index = index; + + mux->kcm_socks_cnt++; + spin_unlock_bh(&mux->lock); + + INIT_WORK(&kcm->tx_work, kcm_tx_work); + + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_rx_msg_timeout(unsigned long arg) +{ + struct kcm_psock *psock = (struct kcm_psock *)arg; + + /* Message assembly timed out */ + KCM_STATS_INCR(psock->stats.rx_msg_timeouts); + kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); +} + +static int kcm_attach(struct socket *sock, struct socket *csock, + struct bpf_prog *prog) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct sock *csk; + struct kcm_psock *psock = NULL, *tpsock; + struct list_head *head; + int index = 0; + + if (csock->ops->family != PF_INET && + csock->ops->family != PF_INET6) + return -EINVAL; + + csk = csock->sk; + if (!csk) + return -EINVAL; + + /* Only support TCP for now */ + if (csk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); + if (!psock) + return -ENOMEM; + + psock->mux = mux; + psock->sk = csk; + psock->bpf_prog = prog; + + setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, + (unsigned long)psock); + + INIT_WORK(&psock->rx_work, psock_rx_work); + INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + + sock_hold(csk); + + write_lock_bh(&csk->sk_callback_lock); + psock->save_data_ready = csk->sk_data_ready; + psock->save_write_space = csk->sk_write_space; + psock->save_state_change = csk->sk_state_change; + csk->sk_user_data = psock; + csk->sk_data_ready = psock_tcp_data_ready; + csk->sk_write_space = psock_tcp_write_space; + csk->sk_state_change = psock_tcp_state_change; + write_unlock_bh(&csk->sk_callback_lock); + + /* Finished initialization, now add the psock to the MUX. */ + spin_lock_bh(&mux->lock); + head = &mux->psocks; + list_for_each_entry(tpsock, &mux->psocks, psock_list) { + if (tpsock->index != index) + break; + head = &tpsock->psock_list; + index++; + } + + list_add(&psock->psock_list, head); + psock->index = index; + + KCM_STATS_INCR(mux->stats.psock_attach); + mux->psocks_cnt++; + psock_now_avail(psock); + spin_unlock_bh(&mux->lock); + + /* Schedule RX work in case there are already bytes queued */ + queue_work(kcm_wq, &psock->rx_work); + + return 0; +} + +static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) +{ + struct socket *csock; + struct bpf_prog *prog; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + prog = bpf_prog_get(info->bpf_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + err = -EINVAL; + goto out; + } + + err = kcm_attach(sock, csock, prog); + if (err) { + bpf_prog_put(prog); + goto out; + } + + /* Keep reference on file also */ + + return 0; +out: + fput(csock->file); + return err; +} + +static void kcm_unattach(struct kcm_psock *psock) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Stop getting callbacks from TCP socket. After this there should + * be no way to reserve a kcm for this psock. + */ + write_lock_bh(&csk->sk_callback_lock); + csk->sk_user_data = NULL; + csk->sk_data_ready = psock->save_data_ready; + csk->sk_write_space = psock->save_write_space; + csk->sk_state_change = psock->save_state_change; + psock->rx_stopped = 1; + + if (WARN_ON(psock->rx_kcm)) { + write_unlock_bh(&csk->sk_callback_lock); + return; + } + + spin_lock_bh(&mux->rx_lock); + + /* Stop receiver activities. After this point psock should not be + * able to get onto ready list either through callbacks or work. + */ + if (psock->ready_rx_msg) { + list_del(&psock->psock_ready_list); + kfree_skb(psock->ready_rx_msg); + psock->ready_rx_msg = NULL; + KCM_STATS_INCR(mux->stats.rx_ready_drops); + } + + spin_unlock_bh(&mux->rx_lock); + + write_unlock_bh(&csk->sk_callback_lock); + + del_timer_sync(&psock->rx_msg_timer); + cancel_work_sync(&psock->rx_work); + cancel_delayed_work_sync(&psock->rx_delayed_work); + + bpf_prog_put(psock->bpf_prog); + + kfree_skb(psock->rx_skb_head); + psock->rx_skb_head = NULL; + + spin_lock_bh(&mux->lock); + + aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + + KCM_STATS_INCR(mux->stats.psock_unattach); + + if (psock->tx_kcm) { + /* psock was reserved. Just mark it finished and we will clean + * up in the kcm paths, we need kcm lock which can not be + * acquired here. + */ + KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); + spin_unlock_bh(&mux->lock); + + /* We are unattaching a socket that is reserved. Abort the + * socket since we may be out of sync in sending on it. We need + * to do this without the mux lock. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + + spin_lock_bh(&mux->lock); + if (!psock->tx_kcm) { + /* psock now unreserved in window mux was unlocked */ + goto no_reserved; + } + psock->done = 1; + + /* Commit done before queuing work to process it */ + smp_mb(); + + /* Queue tx work to make sure psock->done is handled */ + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + spin_unlock_bh(&mux->lock); + } else { +no_reserved: + if (!psock->tx_stopped) + list_del(&psock->psock_avail_list); + list_del(&psock->psock_list); + mux->psocks_cnt--; + spin_unlock_bh(&mux->lock); + + sock_put(csk); + fput(csk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } +} + +static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct socket *csock; + struct sock *csk; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + csk = csock->sk; + if (!csk) { + err = -EINVAL; + goto out; + } + + err = -ENOENT; + + spin_lock_bh(&mux->lock); + + list_for_each_entry(psock, &mux->psocks, psock_list) { + if (psock->sk != csk) + continue; + + /* Found the matching psock */ + + if (psock->unattaching || WARN_ON(psock->done)) { + err = -EALREADY; + break; + } + + psock->unattaching = 1; + + spin_unlock_bh(&mux->lock); + + kcm_unattach(psock); + + err = 0; + goto out; + } + + spin_unlock_bh(&mux->lock); + +out: + fput(csock->file); + return err; +} + +static struct proto kcm_proto = { + .name = "KCM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct kcm_sock), +}; + +/* Clone a kcm socket. */ +static int kcm_clone(struct socket *osock, struct kcm_clone *info, + struct socket **newsockp) +{ + struct socket *newsock; + struct sock *newsk; + struct file *newfile; + int err, newfd; + + err = -ENFILE; + newsock = sock_alloc(); + if (!newsock) + goto out; + + newsock->type = osock->type; + newsock->ops = osock->ops; + + __module_get(newsock->ops->owner); + + newfd = get_unused_fd_flags(0); + if (unlikely(newfd < 0)) { + err = newfd; + goto out_fd_fail; + } + + newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); + if (unlikely(IS_ERR(newfile))) { + err = PTR_ERR(newfile); + goto out_sock_alloc_fail; + } + + newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, + &kcm_proto, true); + if (!newsk) { + err = -ENOMEM; + goto out_sk_alloc_fail; + } + + sock_init_data(newsock, newsk); + init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); + + fd_install(newfd, newfile); + *newsockp = newsock; + info->fd = newfd; + + return 0; + +out_sk_alloc_fail: + fput(newfile); +out_sock_alloc_fail: + put_unused_fd(newfd); +out_fd_fail: + sock_release(newsock); +out: + return err; +} + +static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case SIOCKCMATTACH: { + struct kcm_attach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_attach_ioctl(sock, &info); + + break; + } + case SIOCKCMUNATTACH: { + struct kcm_unattach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_unattach_ioctl(sock, &info); + + break; + } + case SIOCKCMCLONE: { + struct kcm_clone info; + struct socket *newsock = NULL; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_clone(sock, &info, &newsock); + + if (!err) { + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + err = -EFAULT; + sock_release(newsock); + } + } + + break; + } + default: + err = -ENOIOCTLCMD; + break; + } + + return err; +} + +static void free_mux(struct rcu_head *rcu) +{ + struct kcm_mux *mux = container_of(rcu, + struct kcm_mux, rcu); + + kmem_cache_free(kcm_muxp, mux); +} + +static void release_mux(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + struct kcm_psock *psock, *tmp_psock; + + /* Release psocks */ + list_for_each_entry_safe(psock, tmp_psock, + &mux->psocks, psock_list) { + if (!WARN_ON(psock->unattaching)) + kcm_unattach(psock); + } + + if (WARN_ON(mux->psocks_cnt)) + return; + + __skb_queue_purge(&mux->rx_hold_queue); + + mutex_lock(&knet->mutex); + aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &knet->aggregate_psock_stats); + list_del_rcu(&mux->kcm_mux_list); + knet->count--; + mutex_unlock(&knet->mutex); + + call_rcu(&mux->rcu, free_mux); +} + +static void kcm_done(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct sock *sk = &kcm->sk; + int socks_cnt; + + spin_lock_bh(&mux->rx_lock); + if (kcm->rx_psock) { + /* Cleanup in unreserve_rx_kcm */ + WARN_ON(kcm->done); + kcm->rx_disabled = 1; + kcm->done = 1; + spin_unlock_bh(&mux->rx_lock); + return; + } + + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + /* Move any pending receive messages to other kcm sockets */ + requeue_rx_msgs(mux, &sk->sk_receive_queue); + + spin_unlock_bh(&mux->rx_lock); + + if (WARN_ON(sk_rmem_alloc_get(sk))) + return; + + /* Detach from MUX */ + spin_lock_bh(&mux->lock); + + list_del(&kcm->kcm_sock_list); + mux->kcm_socks_cnt--; + socks_cnt = mux->kcm_socks_cnt; + + spin_unlock_bh(&mux->lock); + + if (!socks_cnt) { + /* We are done with the mux now. */ + release_mux(mux); + } + + WARN_ON(kcm->rx_wait); + + sock_put(&kcm->sk); +} + +/* Called by kcm_release to close a KCM socket. + * If this is the last KCM socket on the MUX, destroy the MUX. + */ +static int kcm_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm; + struct kcm_mux *mux; + struct kcm_psock *psock; + + if (!sk) + return 0; + + kcm = kcm_sk(sk); + mux = kcm->mux; + + sock_orphan(sk); + kfree_skb(kcm->seq_skb); + + lock_sock(sk); + /* Purge queue under lock to avoid race condition with tx_work trying + * to act when queue is nonempty. If tx_work runs after this point + * it will just return. + */ + __skb_queue_purge(&sk->sk_write_queue); + release_sock(sk); + + spin_lock_bh(&mux->lock); + if (kcm->tx_wait) { + /* Take of tx_wait list, after this point there should be no way + * that a psock will be assigned to this kcm. + */ + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + spin_unlock_bh(&mux->lock); + + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ + cancel_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; + if (psock) { + /* A psock was reserved, so we need to kill it since it + * may already have some bytes queued from a message. We + * need to do this after removing kcm from tx_wait list. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + unreserve_psock(kcm); + } + release_sock(sk); + + WARN_ON(kcm->tx_wait); + WARN_ON(kcm->tx_psock); + + sock->sk = NULL; + + kcm_done(kcm); + + return 0; +} + +static const struct proto_ops kcm_dgram_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, +}; + +static const struct proto_ops kcm_seqpacket_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, + .splice_read = kcm_splice_read, +}; + +/* Create proto operation for kcm sockets */ +static int kcm_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct sock *sk; + struct kcm_mux *mux; + + switch (sock->type) { + case SOCK_DGRAM: + sock->ops = &kcm_dgram_ops; + break; + case SOCK_SEQPACKET: + sock->ops = &kcm_seqpacket_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (protocol != KCMPROTO_CONNECTED) + return -EPROTONOSUPPORT; + + sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); + if (!sk) + return -ENOMEM; + + /* Allocate a kcm mux, shared between KCM sockets */ + mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); + if (!mux) { + sk_free(sk); + return -ENOMEM; + } + + spin_lock_init(&mux->lock); + spin_lock_init(&mux->rx_lock); + INIT_LIST_HEAD(&mux->kcm_socks); + INIT_LIST_HEAD(&mux->kcm_rx_waiters); + INIT_LIST_HEAD(&mux->kcm_tx_waiters); + + INIT_LIST_HEAD(&mux->psocks); + INIT_LIST_HEAD(&mux->psocks_ready); + INIT_LIST_HEAD(&mux->psocks_avail); + + mux->knet = knet; + + /* Add new MUX to list */ + mutex_lock(&knet->mutex); + list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); + knet->count++; + mutex_unlock(&knet->mutex); + + skb_queue_head_init(&mux->rx_hold_queue); + + /* Init KCM socket */ + sock_init_data(sock, sk); + init_kcm_sock(kcm_sk(sk), mux); + + return 0; +} + +static struct net_proto_family kcm_family_ops = { + .family = PF_KCM, + .create = kcm_create, + .owner = THIS_MODULE, +}; + +static __net_init int kcm_init_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + INIT_LIST_HEAD_RCU(&knet->mux_list); + mutex_init(&knet->mutex); + + return 0; +} + +static __net_exit void kcm_exit_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + /* All KCM sockets should be closed at this point, which should mean + * that all multiplexors and psocks have been destroyed. + */ + WARN_ON(!list_empty(&knet->mux_list)); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_init_net, + .exit = kcm_exit_net, + .id = &kcm_net_id, + .size = sizeof(struct kcm_net), +}; + +static int __init kcm_init(void) +{ + int err = -ENOMEM; + + kcm_muxp = kmem_cache_create("kcm_mux_cache", + sizeof(struct kcm_mux), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_muxp) + goto fail; + + kcm_psockp = kmem_cache_create("kcm_psock_cache", + sizeof(struct kcm_psock), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_psockp) + goto fail; + + kcm_wq = create_singlethread_workqueue("kkcmd"); + if (!kcm_wq) + goto fail; + + err = proto_register(&kcm_proto, 1); + if (err) + goto fail; + + err = sock_register(&kcm_family_ops); + if (err) + goto sock_register_fail; + + err = register_pernet_device(&kcm_net_ops); + if (err) + goto net_ops_fail; + + err = kcm_proc_init(); + if (err) + goto proc_init_fail; + + return 0; + +proc_init_fail: + unregister_pernet_device(&kcm_net_ops); + +net_ops_fail: + sock_unregister(PF_KCM); + +sock_register_fail: + proto_unregister(&kcm_proto); + +fail: + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); + + if (kcm_wq) + destroy_workqueue(kcm_wq); + + return err; +} + +static void __exit kcm_exit(void) +{ + kcm_proc_exit(); + unregister_pernet_device(&kcm_net_ops); + sock_unregister(PF_KCM); + proto_unregister(&kcm_proto); + destroy_workqueue(kcm_wq); + + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); +} + +module_init(kcm_init); +module_exit(kcm_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_KCM); + diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f93c5be612a7..2caaa84ce92d 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family, ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family, ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 8e5ead366e7f..e925037fa0df 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -17,7 +17,7 @@ * @dev: targeted interface */ -int l3mdev_master_ifindex_rcu(struct net_device *dev) +int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; @@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev) ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; + struct net_device *_dev = (struct net_device *)dev; - master = netdev_master_upper_dev_get_rcu(dev); + /* netdev_master_upper_dev_get_rcu calls + * list_first_or_null_rcu to walk the upper dev list. + * list_first_or_null_rcu does not handle a const arg. We aren't + * making changes, just want the master device from that list so + * typecast to remove the const + */ + master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8dab4e569571..b3c52e3f689a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static int llc_ui_wait_for_conn(struct sock *sk, long timeout); +static long llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static int llc_ui_wait_for_conn(struct sock *sk, long timeout) +static long llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 10ad4ac1fa0b..3a8f881b22f1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -61,16 +62,25 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, { struct ieee80211_local *local = sta->local; struct tid_ampdu_rx *tid_rx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_RX_STOP, + .tid = tid, + .amsdu = false, + .timeout = 0, + .ssn = 0, + }; lockdep_assert_held(&sta->ampdu_mlme.mtx); tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid], lockdep_is_held(&sta->ampdu_mlme.mtx)); - if (!tid_rx) + if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid)) return; RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL); + __clear_bit(tid, sta->ampdu_mlme.agg_session_valid); ht_dbg(sta->sdata, "Rx BA session stop requested for %pM tid %u %s reason: %d\n", @@ -78,8 +88,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", (int)reason); - if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, - &sta->sta, tid, NULL, 0, false)) + if (drv_ampdu_action(local, sta->sdata, ¶ms)) sdata_info(sta->sdata, "HW problem - can not stop rx aggregation for %pM tid %d\n", sta->sta.addr, tid); @@ -89,6 +98,13 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, ieee80211_send_delba(sta->sdata, sta->sta.addr, tid, WLAN_BACK_RECIPIENT, reason); + /* + * return here in case tid_rx is not assigned - which will happen if + * IEEE80211_HW_SUPPORTS_REORDERING_BUFFER is set. + */ + if (!tid_rx) + return; + del_timer_sync(&tid_rx->session_timer); /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */ @@ -237,6 +253,15 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_RX_START, + .tid = tid, + .amsdu = false, + .timeout = timeout, + .ssn = start_seq_num, + }; + int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; @@ -275,11 +300,12 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > local->hw.max_rx_aggregation_subframes) buf_size = local->hw.max_rx_aggregation_subframes; + params.buf_size = buf_size; /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); - if (sta->ampdu_mlme.tid_rx[tid]) { + if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { ht_dbg_ratelimited(sta->sdata, "unexpected AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); @@ -290,8 +316,18 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, false); } + if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) { + ret = drv_ampdu_action(local, sta->sdata, ¶ms); + ht_dbg(sta->sdata, + "Rx A-MPDU request on %pM tid %d result %d\n", + sta->sta.addr, tid, ret); + if (!ret) + status = WLAN_STATUS_SUCCESS; + goto end; + } + /* prepare A-MPDU MLME for Rx aggregation */ - tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL); + tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); if (!tid_agg_rx) goto end; @@ -322,8 +358,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, for (i = 0; i < buf_size; i++) __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]); - ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, - &sta->sta, tid, &start_seq_num, 0, false); + ret = drv_ampdu_action(local, sta->sdata, ¶ms); ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", sta->sta.addr, tid, ret); if (ret) { @@ -341,6 +376,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; tid_agg_rx->auto_seq = auto_seq; + tid_agg_rx->reorder_buf_filtered = 0; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ @@ -352,6 +388,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } end: + if (status == WLAN_STATUS_SUCCESS) + __set_bit(tid, sta->ampdu_mlme.agg_session_valid); mutex_unlock(&sta->ampdu_mlme.mtx); end_no_lock: diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ff757181b0a8..4932e9f243a2 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -295,7 +296,14 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, { struct ieee80211_local *local = sta->local; struct tid_ampdu_tx *tid_tx; - enum ieee80211_ampdu_mlme_action action; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .tid = tid, + .buf_size = 0, + .amsdu = false, + .timeout = 0, + .ssn = 0, + }; int ret; lockdep_assert_held(&sta->ampdu_mlme.mtx); @@ -304,10 +312,10 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, case AGG_STOP_DECLINED: case AGG_STOP_LOCAL_REQUEST: case AGG_STOP_PEER_REQUEST: - action = IEEE80211_AMPDU_TX_STOP_CONT; + params.action = IEEE80211_AMPDU_TX_STOP_CONT; break; case AGG_STOP_DESTROY_STA: - action = IEEE80211_AMPDU_TX_STOP_FLUSH; + params.action = IEEE80211_AMPDU_TX_STOP_FLUSH; break; default: WARN_ON_ONCE(1); @@ -330,9 +338,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, spin_unlock_bh(&sta->lock); if (reason != AGG_STOP_DESTROY_STA) return -EALREADY; - ret = drv_ampdu_action(local, sta->sdata, - IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, - &sta->sta, tid, NULL, 0, false); + params.action = IEEE80211_AMPDU_TX_STOP_FLUSH_CONT; + ret = drv_ampdu_action(local, sta->sdata, ¶ms); WARN_ON_ONCE(ret); return 0; } @@ -381,8 +388,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, WLAN_BACK_INITIATOR; tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST; - ret = drv_ampdu_action(local, sta->sdata, action, - &sta->sta, tid, NULL, 0, false); + ret = drv_ampdu_action(local, sta->sdata, ¶ms); /* HW shall not deny going back to legacy */ if (WARN_ON(ret)) { @@ -445,7 +451,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) struct tid_ampdu_tx *tid_tx; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - u16 start_seq_num; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_TX_START, + .tid = tid, + .buf_size = 0, + .amsdu = false, + .timeout = 0, + }; int ret; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -467,10 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ synchronize_net(); - start_seq_num = sta->tid_seq[tid] >> 4; - - ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START, - &sta->sta, tid, &start_seq_num, 0, false); + params.ssn = sta->tid_seq[tid] >> 4; + ret = drv_ampdu_action(local, sdata, ¶ms); if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", @@ -499,7 +510,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, start_seq_num, + tid_tx->dialog_token, params.ssn, IEEE80211_MAX_AMPDU_BUF, tid_tx->timeout); } @@ -684,18 +695,24 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, struct sta_info *sta, u16 tid) { struct tid_ampdu_tx *tid_tx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_TX_OPERATIONAL, + .tid = tid, + .timeout = 0, + .ssn = 0, + }; lockdep_assert_held(&sta->ampdu_mlme.mtx); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + params.buf_size = tid_tx->buf_size; + params.amsdu = tid_tx->amsdu; ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n", sta->sta.addr, tid); - drv_ampdu_action(local, sta->sdata, - IEEE80211_AMPDU_TX_OPERATIONAL, - &sta->sta, tid, NULL, tid_tx->buf_size, - tid_tx->amsdu); + drv_ampdu_action(local, sta->sdata, ¶ms); /* * synchronize with TX path, while splicing the TX path diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 166a29fe6c35..fe1704c4e8fb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -339,8 +339,9 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: - iv32 = key->u.tkip.tx.iv32; - iv16 = key->u.tkip.tx.iv16; + pn64 = atomic64_read(&key->conf.tx_pn); + iv32 = TKIP_PN_TO_IV32(pn64); + iv16 = TKIP_PN_TO_IV16(pn64); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { @@ -1131,6 +1132,34 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.max_sp = params->max_sp; } + /* The sender might not have sent the last bit, consider it to be 0 */ + if (params->ext_capab_len >= 8) { + u8 val = (params->ext_capab[7] & + WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7; + + /* we did get all the bits, take the MSB as well */ + if (params->ext_capab_len >= 9) { + u8 val_msb = params->ext_capab[8] & + WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB; + val_msb <<= 1; + val |= val_msb; + } + + switch (val) { + case 1: + sta->sta.max_amsdu_subframes = 32; + break; + case 2: + sta->sta.max_amsdu_subframes = 16; + break; + case 3: + sta->sta.max_amsdu_subframes = 8; + break; + default: + sta->sta.max_amsdu_subframes = 0; + } + } + /* * cfg80211 validates this (1-2007) and allows setting the AID * only when creating a new station entry @@ -1160,6 +1189,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, params->ht_capa, sta); + /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 1d1b9b7bdefe..283981108ca8 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -231,7 +231,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata) !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; - if (!sta->uploaded) + if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 3e24d0ddb51b..4ab5c522ceee 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -126,6 +126,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), + FLAG(SUPPORTS_REORDERING_BUFFER), #undef FLAG }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 7961e7d0b61e..a2ef95f16f11 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -132,9 +132,10 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: + pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", - key->u.tkip.tx.iv32, - key->u.tkip.tx.iv16); + TKIP_PN_TO_IV32(pn), + TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index ca1fe5576103..c258f1041d33 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -284,9 +284,7 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local, int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu) + struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; @@ -296,12 +294,10 @@ int drv_ampdu_action(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return -EIO; - trace_drv_ampdu_action(local, sdata, action, sta, tid, - ssn, buf_size, amsdu); + trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, - sta, tid, ssn, buf_size, amsdu); + ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 154ce4b13406..18b0d65baff0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -585,9 +585,7 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local) int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu); + struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 7a76ce639d58..f4a528773563 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -230,6 +230,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, /* set Rx highest rate */ ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest; + if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU) + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935; + else + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839; + apply: changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 978d3bc31df7..fc3238376b39 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -7,6 +7,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1050,9 +1051,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; - ieee80211_ht_oper_to_chandef(channel, - elems->ht_operation, - &chandef); + cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT); + ieee80211_chandef_ht_oper(elems->ht_operation, &chandef); memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -1066,9 +1066,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; - ieee80211_vht_oper_to_chandef(channel, - elems->vht_operation, - &chandef); + ieee80211_chandef_vht_oper(elems->vht_operation, + &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, &cap_ie, sta); @@ -1485,14 +1484,21 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) sdata_info(sdata, "Trigger new scan to find an IBSS to join\n"); - num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, - &ifibss->chandef, - channels, - ARRAY_SIZE(channels)); scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); - ieee80211_request_ibss_scan(sdata, ifibss->ssid, - ifibss->ssid_len, channels, num, - scan_width); + + if (ifibss->fixed_channel) { + num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, + &ifibss->chandef, + channels, + ARRAY_SIZE(channels)); + ieee80211_request_ibss_scan(sdata, ifibss->ssid, + ifibss->ssid_len, channels, + num, scan_width); + } else { + ieee80211_request_ibss_scan(sdata, ifibss->ssid, + ifibss->ssid_len, NULL, + 0, scan_width); + } } else { int interval = IEEE80211_SCAN_INTERVAL; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b84f6aa32c08..804575ff7af5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -92,7 +92,7 @@ struct ieee80211_fragment_entry { u16 extra_len; u16 last_frag; u8 rx_queue; - bool ccmp; /* Whether fragments were encrypted with CCMP */ + bool check_sequential_pn; /* needed for CCMP/GCMP */ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ }; @@ -716,7 +716,6 @@ struct ieee80211_if_mesh { * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver - * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), @@ -724,7 +723,6 @@ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), - IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6), }; /** @@ -804,6 +802,7 @@ enum txq_info_flags { struct txq_info { struct sk_buff_head queue; unsigned long flags; + unsigned long byte_cnt; /* keep last! */ struct ieee80211_txq txq; @@ -1466,7 +1465,13 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START && status->flag & RX_FLAG_MACTIME_END); - return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END); + if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END)) + return true; + /* can't handle HT/VHT preamble yet */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START && + !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT))) + return true; + return false; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, @@ -1714,6 +1719,8 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); +void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum ieee80211_band band); @@ -1829,20 +1836,6 @@ static inline void ieee802_11_parse_elems(const u8 *start, size_t len, ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0); } -static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames) -{ - struct sk_buff *tail = skb_peek_tail(frames); - struct ieee80211_rx_status *status; - - if (!tail) - return false; - - status = IEEE80211_SKB_RXCB(tail); - if (status->flag & RX_FLAG_AMSDU_MORE) - return false; - - return true; -} extern const int ieee802_1d_to_ac[8]; @@ -1986,12 +1979,10 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ -void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_ht_operation *ht_oper, - struct cfg80211_chan_def *chandef); -void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_vht_operation *oper, - struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, + struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c9e325d2e120..453b4e741780 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -977,7 +977,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->vif.txq) { struct txq_info *txqi = to_txq_info(sdata->vif.txq); + spin_lock_bh(&txqi->queue.lock); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + txqi->byte_cnt = 0; + spin_unlock_bh(&txqi->queue.lock); + atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); } @@ -1271,6 +1275,16 @@ static void ieee80211_iface_work(struct work_struct *work) } } mutex_unlock(&local->sta_mtx); + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT) { + switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_GROUPID_MGMT: + ieee80211_process_mu_groups(sdata, mgmt); + break; + default: + WARN_ON(1); + break; + } } else if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *)mgmt; /* diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 5e5bc599da4c..3df7b0392d30 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -932,50 +932,6 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify); -void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq) -{ - struct ieee80211_key *key; - u64 pn64; - - if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV))) - return; - - key = container_of(keyconf, struct ieee80211_key, conf); - - switch (key->conf.cipher) { - case WLAN_CIPHER_SUITE_TKIP: - seq->tkip.iv32 = key->u.tkip.tx.iv32; - seq->tkip.iv16 = key->u.tkip.tx.iv16; - break; - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_AES_CMAC: - case WLAN_CIPHER_SUITE_BIP_CMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_cmac)); - case WLAN_CIPHER_SUITE_BIP_GMAC_128: - case WLAN_CIPHER_SUITE_BIP_GMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_gmac)); - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), gcmp)); - pn64 = atomic64_read(&key->conf.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; - default: - WARN_ON(1); - } -} -EXPORT_SYMBOL(ieee80211_get_key_tx_seq); - void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { @@ -1029,48 +985,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, } EXPORT_SYMBOL(ieee80211_get_key_rx_seq); -void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq) -{ - struct ieee80211_key *key; - u64 pn64; - - key = container_of(keyconf, struct ieee80211_key, conf); - - switch (key->conf.cipher) { - case WLAN_CIPHER_SUITE_TKIP: - key->u.tkip.tx.iv32 = seq->tkip.iv32; - key->u.tkip.tx.iv16 = seq->tkip.iv16; - break; - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_AES_CMAC: - case WLAN_CIPHER_SUITE_BIP_CMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_cmac)); - case WLAN_CIPHER_SUITE_BIP_GMAC_128: - case WLAN_CIPHER_SUITE_BIP_GMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_gmac)); - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), gcmp)); - pn64 = (u64)seq->ccmp.pn[5] | - ((u64)seq->ccmp.pn[4] << 8) | - ((u64)seq->ccmp.pn[3] << 16) | - ((u64)seq->ccmp.pn[2] << 24) | - ((u64)seq->ccmp.pn[1] << 32) | - ((u64)seq->ccmp.pn[0] << 40); - atomic64_set(&key->conf.tx_pn, pn64); - break; - default: - WARN_ON(1); - break; - } -} -EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq); - void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 9951ef06323e..4aa20cef0859 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -44,13 +44,17 @@ enum ieee80211_internal_tkip_state { }; struct tkip_ctx { - u32 iv32; /* current iv32 */ - u16 iv16; /* current iv16 */ u16 p1k[5]; /* p1k cache */ u32 p1k_iv32; /* iv32 for which p1k computed */ enum ieee80211_internal_tkip_state state; }; +struct tkip_ctx_rx { + struct tkip_ctx ctx; + u32 iv32; /* current iv32 */ + u16 iv16; /* current iv16 */ +}; + struct ieee80211_key { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -71,7 +75,7 @@ struct ieee80211_key { struct tkip_ctx tx; /* last received RSC */ - struct tkip_ctx rx[IEEE80211_NUM_TIDS]; + struct tkip_ctx_rx rx[IEEE80211_NUM_TIDS]; /* number of mic failures */ u32 mic_failures; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6f85b6ab8e51..d32cefcb63b0 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -91,11 +91,10 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, if (sdata->vif.bss_conf.basic_rates != basic_rates) return false; - ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, - ie->ht_operation, &sta_chan_def); - - ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, - ie->vht_operation, &sta_chan_def); + cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, + NL80211_CHAN_NO_HT); + ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); + ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 4a8019f79fb2..87c017a3b1ce 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -137,8 +137,6 @@ struct mesh_path { * @copy_node: function to copy nodes of the table * @size_order: determines size of the table, there will be 2^size_order hash * buckets - * @mean_chain_len: maximum average length for the hash buckets' list, if it is - * reached, the table will grow * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. * @@ -154,7 +152,6 @@ struct mesh_table { void (*free_node) (struct hlist_node *p, bool free_leafs); int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl); int size_order; - int mean_chain_len; struct hlist_head *known_gates; spinlock_t gates_lock; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c6be0b4f4058..5b6aec1a0630 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -205,9 +205,9 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - skb_set_mac_header(skb, 0); - skb_set_network_header(skb, 0); - skb_set_transport_header(skb, 0); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ skb_set_queue_mapping(skb, IEEE80211_AC_VO); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index dadf8dc6f1cf..2ba7aa56b11c 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -55,16 +55,21 @@ int mpp_paths_generation; static DEFINE_RWLOCK(pathtbl_resize_lock); +static inline struct mesh_table *resize_dereference_paths( + struct mesh_table __rcu *table) +{ + return rcu_dereference_protected(table, + lockdep_is_held(&pathtbl_resize_lock)); +} + static inline struct mesh_table *resize_dereference_mesh_paths(void) { - return rcu_dereference_protected(mesh_paths, - lockdep_is_held(&pathtbl_resize_lock)); + return resize_dereference_paths(mesh_paths); } static inline struct mesh_table *resize_dereference_mpp_paths(void) { - return rcu_dereference_protected(mpp_paths, - lockdep_is_held(&pathtbl_resize_lock)); + return resize_dereference_paths(mpp_paths); } /* @@ -160,11 +165,10 @@ static int mesh_table_grow(struct mesh_table *oldtbl, int i; if (atomic_read(&oldtbl->entries) - < oldtbl->mean_chain_len * (oldtbl->hash_mask + 1)) + < MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1)) return -EAGAIN; newtbl->free_node = oldtbl->free_node; - newtbl->mean_chain_len = oldtbl->mean_chain_len; newtbl->copy_node = oldtbl->copy_node; newtbl->known_gates = oldtbl->known_gates; atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries)); @@ -585,7 +589,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, hlist_add_head_rcu(&new_node->list, bucket); if (atomic_inc_return(&tbl->entries) >= - tbl->mean_chain_len * (tbl->hash_mask + 1)) + MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) grow = 1; mesh_paths_generation++; @@ -714,7 +718,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, hlist_add_head_rcu(&new_node->list, bucket); if (atomic_inc_return(&tbl->entries) >= - tbl->mean_chain_len * (tbl->hash_mask + 1)) + MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) grow = 1; spin_unlock(&tbl->hashwlock[hash_idx]); @@ -835,6 +839,29 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) rcu_read_unlock(); } +static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, + const u8 *proxy) +{ + struct mesh_table *tbl; + struct mesh_path *mpp; + struct mpath_node *node; + int i; + + rcu_read_lock(); + read_lock_bh(&pathtbl_resize_lock); + tbl = resize_dereference_mpp_paths(); + for_each_mesh_entry(tbl, node, i) { + mpp = node->mpath; + if (ether_addr_equal(mpp->mpp, proxy)) { + spin_lock(&tbl->hashwlock[i]); + __mesh_path_del(tbl, node); + spin_unlock(&tbl->hashwlock[i]); + } + } + read_unlock_bh(&pathtbl_resize_lock); + rcu_read_unlock(); +} + static void table_flush_by_iface(struct mesh_table *tbl, struct ieee80211_sub_if_data *sdata) { @@ -876,14 +903,17 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) } /** - * mesh_path_del - delete a mesh path from the table + * table_path_del - delete a path from the mesh or mpp table * - * @addr: dst address (ETH_ALEN length) + * @tbl: mesh or mpp path table * @sdata: local subif + * @addr: dst address (ETH_ALEN length) * * Returns: 0 if successful */ -int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +static int table_path_del(struct mesh_table __rcu *rcu_tbl, + struct ieee80211_sub_if_data *sdata, + const u8 *addr) { struct mesh_table *tbl; struct mesh_path *mpath; @@ -892,8 +922,7 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) int hash_idx; int err = 0; - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mesh_paths(); + tbl = resize_dereference_paths(rcu_tbl); hash_idx = mesh_table_hash(addr, sdata, tbl); bucket = &tbl->hash_buckets[hash_idx]; @@ -909,9 +938,50 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) err = -ENXIO; enddel: - mesh_paths_generation++; spin_unlock(&tbl->hashwlock[hash_idx]); + return err; +} + +/** + * mesh_path_del - delete a mesh path from the table + * + * @addr: dst address (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 if successful + */ +int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +{ + int err = 0; + + /* flush relevant mpp entries first */ + mpp_flush_by_proxy(sdata, addr); + + read_lock_bh(&pathtbl_resize_lock); + err = table_path_del(mesh_paths, sdata, addr); + mesh_paths_generation++; read_unlock_bh(&pathtbl_resize_lock); + + return err; +} + +/** + * mpp_path_del - delete a mesh proxy path from the table + * + * @addr: addr address (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 if successful + */ +static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +{ + int err = 0; + + read_lock_bh(&pathtbl_resize_lock); + err = table_path_del(mpp_paths, sdata, addr); + mpp_paths_generation++; + read_unlock_bh(&pathtbl_resize_lock); + return err; } @@ -1076,7 +1146,6 @@ int mesh_pathtbl_init(void) return -ENOMEM; tbl_path->free_node = &mesh_path_node_free; tbl_path->copy_node = &mesh_path_node_copy; - tbl_path->mean_chain_len = MEAN_CHAIN_LEN; tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); if (!tbl_path->known_gates) { ret = -ENOMEM; @@ -1092,7 +1161,6 @@ int mesh_pathtbl_init(void) } tbl_mpp->free_node = &mesh_path_node_free; tbl_mpp->copy_node = &mesh_path_node_copy; - tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN; tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); if (!tbl_mpp->known_gates) { ret = -ENOMEM; @@ -1131,6 +1199,17 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata) time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) mesh_path_del(mpath->sdata, mpath->dst); } + + tbl = rcu_dereference(mpp_paths); + for_each_mesh_entry(tbl, node, i) { + if (node->mpath->sdata != sdata) + continue; + mpath = node->mpath; + if ((!(mpath->flags & MESH_PATH_FIXED)) && + time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) + mpp_path_del(mpath->sdata, mpath->dst); + } + rcu_read_unlock(); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index bd3d55eb21d4..a07e93c21c9e 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -976,6 +976,10 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink error: no more free plinks\n"); goto out; } + + /* new matching peer */ + event = OPN_ACPT; + goto out; } else { if (!test_sta_flag(sta, WLAN_STA_AUTH)) { mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n"); @@ -985,12 +989,6 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, goto out; } - /* new matching peer */ - if (!sta) { - event = OPN_ACPT; - goto out; - } - switch (ftype) { case WLAN_SP_MESH_PEERING_OPEN: if (!matches_local) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bfbb1acafdd1..281b8d6e5109 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6,7 +6,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -196,16 +196,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, /* check 40 MHz support, if we have it */ if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) { - switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { - case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 += 10; - break; - case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 -= 10; - break; - } + ieee80211_chandef_ht_oper(ht_oper, chandef); } else { /* 40 MHz (and 80 MHz) must be supported for VHT */ ret = IEEE80211_STA_DISABLE_VHT; @@ -219,35 +210,11 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, goto out; } - vht_chandef.chan = channel; - vht_chandef.center_freq1 = - ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx, - channel->band); - vht_chandef.center_freq2 = 0; - - switch (vht_oper->chan_width) { - case IEEE80211_VHT_CHANWIDTH_USE_HT: - vht_chandef.width = chandef->width; - vht_chandef.center_freq1 = chandef->center_freq1; - break; - case IEEE80211_VHT_CHANWIDTH_80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80; - break; - case IEEE80211_VHT_CHANWIDTH_160MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_160; - break; - case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80P80; - vht_chandef.center_freq2 = - ieee80211_channel_to_frequency( - vht_oper->center_freq_seg2_idx, - channel->band); - break; - default: + vht_chandef = *chandef; + if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, - "AP VHT operation IE has invalid channel width (%d), disable VHT\n", - vht_oper->chan_width); + "AP VHT information is invalid, disable VHT\n"); ret = IEEE80211_STA_DISABLE_VHT; goto out; } @@ -592,7 +559,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data *other; list_for_each_entry_rcu(other, &local->interfaces, list) { - if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) { + if (other->vif.mu_mimo_owner) { disable_mu_mimo = true; break; } @@ -600,7 +567,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, if (disable_mu_mimo) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; else - sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER; + sdata->vif.mu_mimo_owner = true; } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; @@ -1638,8 +1605,7 @@ void ieee80211_dynamic_ps_timer(unsigned long data) void ieee80211_dfs_cac_timer_work(struct work_struct *work) { - struct delayed_work *delayed_work = - container_of(work, struct delayed_work, work); + struct delayed_work *delayed_work = to_delayed_work(work); struct ieee80211_sub_if_data *sdata = container_of(delayed_work, struct ieee80211_sub_if_data, dfs_cac_timer_work); @@ -2079,7 +2045,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); - sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; + + /* reset MU-MIMO ownership and group data */ + memset(sdata->vif.bss_conf.mu_group.membership, 0, + sizeof(sdata->vif.bss_conf.mu_group.membership)); + memset(sdata->vif.bss_conf.mu_group.position, 0, + sizeof(sdata->vif.bss_conf.mu_group.position)); + changed |= BSS_CHANGED_MU_GROUPS; + sdata->vif.mu_mimo_owner = false; sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -2536,7 +2509,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, eth_zero_addr(sdata->u.mgd.bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; - sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; + sdata->vif.mu_mimo_owner = false; + mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->mtx); @@ -3571,6 +3545,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); + sdata_info(sdata, + "failed to follow AP %pM bandwidth change, disconnect\n", + bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); @@ -3946,11 +3923,9 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) * We actually lost the connection ... or did we? * Let's make sure! */ - wiphy_debug(local->hw.wiphy, - "%s: No probe response from AP %pM" - " after %dms, disconnecting.\n", - sdata->name, - bssid, probe_wait_ms); + mlme_dbg(sdata, + "No probe response from AP %pM after %dms, disconnecting.\n", + bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, bssid, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); @@ -4536,6 +4511,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + sdata_info(sdata, + "disconnect from AP %pM for new auth to %pM\n", + ifmgd->associated->bssid, req->bss->bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); @@ -4604,6 +4582,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + sdata_info(sdata, + "disconnect from AP %pM for new assoc to %pM\n", + ifmgd->associated->bssid, req->bss->bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 3ece7d1034c8..b54f398cda5d 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma); + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 3928dbd24e25..370d677b547b 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) (max_tp_group != MINSTREL_CCK_GROUP)) return; + max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; + max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_ewma); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; - max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; - max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); @@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) } else { if (mrs->prob_ewma > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma) + if (mrs->prob_ewma > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (likely(sta->ampdu_mlme.tid_tx[tid])) return; - ieee80211_start_tx_ba_session(pubsta, tid, 5000); + ieee80211_start_tx_ba_session(pubsta, tid, 0); } static void @@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ - if (offset > 0 && + if (offset > 0 || (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; @@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) prob = mi->groups[i].rates[j].prob_ewma; /* convert tp_avg from pkt per second in kbps */ - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024; + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; + tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bc081850ac0e..dc27becb9b71 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +19,7 @@ #include <linux/etherdevice.h> #include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/bitops.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include <asm/unaligned.h> @@ -122,7 +124,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, hdr = (void *)(skb->data + rtap_vendor_space); if (status->flag & (RX_FLAG_FAILED_FCS_CRC | - RX_FLAG_FAILED_PLCP_CRC)) + RX_FLAG_FAILED_PLCP_CRC | + RX_FLAG_ONLY_MONITOR)) return true; if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space)) @@ -507,7 +510,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - if (!local->monitors) { + if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) { if (should_drop_frame(origskb, present_fcs_len, rtap_vendor_space)) { dev_kfree_skb(origskb); @@ -797,6 +800,26 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx, + int index) +{ + struct sk_buff_head *frames = &tid_agg_rx->reorder_buf[index]; + struct sk_buff *tail = skb_peek_tail(frames); + struct ieee80211_rx_status *status; + + if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index)) + return true; + + if (!tail) + return false; + + status = IEEE80211_SKB_RXCB(tail); + if (status->flag & RX_FLAG_AMSDU_MORE) + return false; + + return true; +} + static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, int index, @@ -811,7 +834,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, if (skb_queue_empty(skb_list)) goto no_frame; - if (!ieee80211_rx_reorder_ready(skb_list)) { + if (!ieee80211_rx_reorder_ready(tid_agg_rx, index)) { __skb_queue_purge(skb_list); goto no_frame; } @@ -825,6 +848,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, } no_frame: + tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); } @@ -865,7 +889,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, /* release the buffer until next missing frame */ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; - if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) && + if (!ieee80211_rx_reorder_ready(tid_agg_rx, index) && tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any @@ -874,8 +898,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, int skipped = 1; for (j = (index + 1) % tid_agg_rx->buf_size; j != index; j = (j + 1) % tid_agg_rx->buf_size) { - if (!ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[j])) { + if (!ieee80211_rx_reorder_ready(tid_agg_rx, j)) { skipped++; continue; } @@ -902,8 +925,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, skipped) & IEEE80211_SN_MASK; skipped = 0; } - } else while (ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[index])) { + } else while (ieee80211_rx_reorder_ready(tid_agg_rx, index)) { ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, frames); index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; @@ -914,8 +936,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, for (; j != (index - 1) % tid_agg_rx->buf_size; j = (j + 1) % tid_agg_rx->buf_size) { - if (ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[j])) + if (ieee80211_rx_reorder_ready(tid_agg_rx, j)) break; } @@ -986,7 +1007,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata index = mpdu_seq_num % tid_agg_rx->buf_size; /* check if we already stored this frame */ - if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) { + if (ieee80211_rx_reorder_ready(tid_agg_rx, index)) { dev_kfree_skb(skb); goto out; } @@ -1099,6 +1120,9 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + if (status->flag & RX_FLAG_DUP_VALIDATED) + return RX_CONTINUE; + /* * Drop duplicate 802.11 retransmissions * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery") @@ -1753,7 +1777,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, entry->seq = seq; entry->rx_queue = rx_queue; entry->last_frag = frag; - entry->ccmp = 0; + entry->check_sequential_pn = false; entry->extra_len = 0; return entry; @@ -1849,15 +1873,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, &(rx->skb)); if (rx->key && (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) && + rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && ieee80211_has_protected(fc)) { int queue = rx->security_idx; - /* Store CCMP PN so that we can verify that the next - * fragment has a sequential PN value. */ - entry->ccmp = 1; + + /* Store CCMP/GCMP PN so that we can verify that the + * next fragment has a sequential PN value. + */ + entry->check_sequential_pn = true; memcpy(entry->last_pn, rx->key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); + BUILD_BUG_ON(offsetof(struct ieee80211_key, + u.ccmp.rx_pn) != + offsetof(struct ieee80211_key, + u.gcmp.rx_pn)); + BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) != + sizeof(rx->key->u.gcmp.rx_pn[queue])); + BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN != + IEEE80211_GCMP_PN_LEN); } return RX_QUEUED; } @@ -1872,15 +1908,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* Verify that MPDUs within one MSDU have sequential PN values. - * (IEEE 802.11i, 8.3.3.4.5) */ - if (entry->ccmp) { + /* "The receiver shall discard MSDUs and MMPDUs whose constituent + * MPDU PN values are not incrementing in steps of 1." + * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP) + * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP) + */ + if (entry->check_sequential_pn) { int i; u8 pn[IEEE80211_CCMP_PN_LEN], *rpn; int queue; + if (!rx->key || (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256)) + rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256)) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN); for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) { @@ -2199,9 +2241,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) skb->dev = dev; __skb_queue_head_init(&frame_list); - if (skb_linearize(skb)) - return RX_DROP_UNUSABLE; - ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, rx->local->hw.extra_tx_headroom, true); @@ -2231,7 +2270,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u16 q, hdrlen; + u16 ac, q, hdrlen; hdr = (struct ieee80211_hdr *) skb->data; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -2290,6 +2329,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) spin_lock_bh(&mppath->state_lock); if (!ether_addr_equal(mppath->mpp, mpp_addr)) memcpy(mppath->mpp, mpp_addr, ETH_ALEN); + mppath->exp_time = jiffies; spin_unlock_bh(&mppath->state_lock); } rcu_read_unlock(); @@ -2300,7 +2340,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) ether_addr_equal(sdata->vif.addr, hdr->addr3)) return RX_CONTINUE; - q = ieee80211_select_queue_80211(sdata, skb, hdr); + ac = ieee80211_select_queue_80211(sdata, skb, hdr); + q = sdata->vif.hw_queue[ac]; if (ieee80211_queue_stopped(&local->hw, q)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); return RX_DROP_MONITOR; @@ -2738,6 +2779,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) opmode, status->band); goto handled; } + case WLAN_VHT_ACTION_GROUPID_MGMT: { + if (len < IEEE80211_MIN_ACTION_SIZE + 25) + goto invalid; + goto queue; + } default: break; } @@ -3073,7 +3119,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, false); - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); @@ -3275,6 +3321,85 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) ieee80211_rx_handlers(&rx, &frames); } +void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, + u16 ssn, u64 filtered, + u16 received_mpdus) +{ + struct sta_info *sta; + struct tid_ampdu_rx *tid_agg_rx; + struct sk_buff_head frames; + struct ieee80211_rx_data rx = { + /* This is OK -- must be QoS data frame */ + .security_idx = tid, + .seqno_idx = tid, + }; + int i, diff; + + if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS)) + return; + + __skb_queue_head_init(&frames); + + sta = container_of(pubsta, struct sta_info, sta); + + rx.sta = sta; + rx.sdata = sta->sdata; + rx.local = sta->local; + + rcu_read_lock(); + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + goto out; + + spin_lock_bh(&tid_agg_rx->reorder_lock); + + if (received_mpdus >= IEEE80211_SN_MODULO >> 1) { + int release; + + /* release all frames in the reorder buffer */ + release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) % + IEEE80211_SN_MODULO; + ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, + release, &frames); + /* update ssn to match received ssn */ + tid_agg_rx->head_seq_num = ssn; + } else { + ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn, + &frames); + } + + /* handle the case that received ssn is behind the mac ssn. + * it can be tid_agg_rx->buf_size behind and still be valid */ + diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK; + if (diff >= tid_agg_rx->buf_size) { + tid_agg_rx->reorder_buf_filtered = 0; + goto release; + } + filtered = filtered >> diff; + ssn += diff; + + /* update bitmap */ + for (i = 0; i < tid_agg_rx->buf_size; i++) { + int index = (ssn + i) % tid_agg_rx->buf_size; + + tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); + if (filtered & BIT_ULL(i)) + tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index); + } + + /* now process also frames that the filter marking released */ + ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); + +release: + spin_unlock_bh(&tid_agg_rx->reorder_lock); + + ieee80211_rx_handlers(&rx, &frames); + + out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames); + /* main receive path */ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) @@ -3366,6 +3491,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; /* ignore action frames to TDLS-peers */ if (ieee80211_is_action(hdr->frame_control) && + !is_broadcast_ether_addr(bssid) && !ether_addr_equal(bssid, hdr->addr1)) return false; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a4a4f89d3ba0..d20bab5c146c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -116,6 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta) ieee80211_purge_tx_queue(&local->hw, &txqi->queue); atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); + txqi->byte_cnt = 0; } } @@ -498,11 +499,17 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - struct station_info sinfo; + struct station_info *sinfo; int err = 0; lockdep_assert_held(&local->sta_mtx); + sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); + if (!sinfo) { + err = -ENOMEM; + goto out_err; + } + /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; @@ -530,14 +537,12 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_recalc_min_chandef(sdata); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); - memset(&sinfo, 0, sizeof(sinfo)); - sinfo.filled = 0; - sinfo.generation = local->sta_generation; - cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); + sinfo->generation = local->sta_generation; + cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); + kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); @@ -557,6 +562,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) __cleanup_single_sta(sta); out_err: mutex_unlock(&local->sta_mtx); + kfree(sinfo); rcu_read_lock(); return err; } @@ -898,7 +904,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - struct station_info sinfo = {}; + struct station_info *sinfo; int ret; /* @@ -936,12 +942,14 @@ static void __sta_info_destroy_part2(struct sta_info *sta) sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); - sta_set_sinfo(sta, &sinfo); - cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); + sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); + if (sinfo) + sta_set_sinfo(sta, sinfo); + cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); + kfree(sinfo); rate_control_remove_sta_debugfs(sta); ieee80211_sta_debugfs_remove(sta); - ieee80211_recalc_min_chandef(sdata); cleanup_single_sta(sta); } @@ -1808,14 +1816,17 @@ int sta_info_move_state(struct sta_info *sta, clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: - if (sta->sta_state == IEEE80211_STA_NONE) + if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); - else if (sta->sta_state == IEEE80211_STA_ASSOC) + } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); + ieee80211_recalc_min_chandef(sta->sdata); + } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); + ieee80211_recalc_min_chandef(sta->sdata); } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d6051629ed15..053f5c4fa495 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,6 +1,7 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -167,6 +168,8 @@ struct tid_ampdu_tx { * * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an * A-MSDU with individually reported subframes. + * @reorder_buf_filtered: bitmap indicating where there are filtered frames in + * the reorder buffer that should be ignored when releasing frames * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. @@ -194,6 +197,7 @@ struct tid_ampdu_tx { struct tid_ampdu_rx { struct rcu_head rcu_head; spinlock_t reorder_lock; + u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; struct timer_list session_timer; @@ -212,20 +216,21 @@ struct tid_ampdu_rx { /** * struct sta_ampdu_mlme - STA aggregation information. * + * @mtx: mutex to protect all TX data (except non-NULL assignments + * to tid_tx[idx], which are protected by the sta spinlock) + * tid_start_tx is also protected by sta->lock. * @tid_rx: aggregation info for Rx per TID -- RCU protected - * @tid_tx: aggregation info for Tx per TID - * @tid_start_tx: sessions where start was requested - * @addba_req_num: number of times addBA request has been sent. - * @last_addba_req_time: timestamp of the last addBA request. - * @dialog_token_allocator: dialog token enumerator for each new session; - * @work: work struct for starting/stopping aggregation * @tid_rx_timer_expired: bitmap indicating on which TIDs the * RX timer expired until the work for it runs * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs - * @mtx: mutex to protect all TX data (except non-NULL assignments - * to tid_tx[idx], which are protected by the sta spinlock) - * tid_start_tx is also protected by sta->lock. + * @agg_session_valid: bitmap indicating which TID has a rx BA session open on + * @work: work struct for starting/stopping aggregation + * @tid_tx: aggregation info for Tx per TID + * @tid_start_tx: sessions where start was requested + * @last_addba_req_time: timestamp of the last addBA request. + * @addba_req_num: number of times addBA request has been sent. + * @dialog_token_allocator: dialog token enumerator for each new session; */ struct sta_ampdu_mlme { struct mutex mtx; @@ -233,6 +238,7 @@ struct sta_ampdu_mlme { struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS]; unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; + unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct work_struct work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 6101deb805a8..8b1b2ea03eb5 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -697,7 +697,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, rtap_len, shift); /* XXX: is this sufficient for BPF? */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 0ae207771a58..b3622823bad2 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. + * Copyright (C) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -142,15 +143,14 @@ static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx, /* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets * of the IV. Returns pointer to the octet following IVs (i.e., beginning of * the packet payload). */ -u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key) +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn) { - lockdep_assert_held(&key->u.tkip.txlock); - - pos = write_tkip_iv(pos, key->u.tkip.tx.iv16); - *pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */; - put_unaligned_le32(key->u.tkip.tx.iv32, pos); + pos = write_tkip_iv(pos, TKIP_PN_TO_IV16(pn)); + *pos++ = (keyconf->keyidx << 6) | (1 << 5) /* Ext IV */; + put_unaligned_le32(TKIP_PN_TO_IV32(pn), pos); return pos + 4; } +EXPORT_SYMBOL_GPL(ieee80211_tkip_add_iv); static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32) { @@ -250,6 +250,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, u8 rc4key[16], keyid, *pos = payload; int res; const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; + struct tkip_ctx_rx *rx_ctx = &key->u.tkip.rx[queue]; if (payload_len < 12) return -1; @@ -265,37 +266,36 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, if ((keyid >> 6) != key->conf.keyidx) return TKIP_DECRYPT_INVALID_KEYIDX; - if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT && - (iv32 < key->u.tkip.rx[queue].iv32 || - (iv32 == key->u.tkip.rx[queue].iv32 && - iv16 <= key->u.tkip.rx[queue].iv16))) + if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT && + (iv32 < rx_ctx->iv32 || + (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16))) return TKIP_DECRYPT_REPLAY; if (only_iv) { res = TKIP_DECRYPT_OK; - key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED; goto done; } - if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT || - key->u.tkip.rx[queue].iv32 != iv32) { + if (rx_ctx->ctx.state == TKIP_STATE_NOT_INIT || + rx_ctx->iv32 != iv32) { /* IV16 wrapped around - perform TKIP phase 1 */ - tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32); + tkip_mixing_phase1(tk, &rx_ctx->ctx, ta, iv32); } if (key->local->ops->update_tkip_key && key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && - key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) { + rx_ctx->ctx.state != TKIP_STATE_PHASE1_HW_UPLOADED) { struct ieee80211_sub_if_data *sdata = key->sdata; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(key->sdata->bss, struct ieee80211_sub_if_data, u.ap); drv_update_tkip_key(key->local, sdata, &key->conf, key->sta, - iv32, key->u.tkip.rx[queue].p1k); - key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + iv32, rx_ctx->ctx.p1k); + rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED; } - tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key); + tkip_mixing_phase2(tk, &rx_ctx->ctx, iv16, rc4key); res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12); done: diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h index e3ecb659b90a..a1bcbfbefe7c 100644 --- a/net/mac80211/tkip.h +++ b/net/mac80211/tkip.h @@ -13,8 +13,6 @@ #include <linux/crypto.h> #include "key.h" -u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key); - int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm, struct ieee80211_key *key, struct sk_buff *skb, diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index a6b4442776a0..2b0a17ee907a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -80,7 +80,23 @@ #define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" #define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx - +#define AMPDU_ACTION_ENTRY __field(enum ieee80211_ampdu_mlme_action, \ + ieee80211_ampdu_mlme_action) \ + STA_ENTRY \ + __field(u16, tid) \ + __field(u16, ssn) \ + __field(u8, buf_size) \ + __field(bool, amsdu) \ + __field(u16, timeout) +#define AMPDU_ACTION_ASSIGN STA_NAMED_ASSIGN(params->sta); \ + __entry->tid = params->tid; \ + __entry->ssn = params->ssn; \ + __entry->buf_size = params->buf_size; \ + __entry->amsdu = params->amsdu; \ + __entry->timeout = params->timeout; +#define AMPDU_ACTION_PR_FMT STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d" +#define AMPDU_ACTION_PR_ARG STA_PR_ARG, __entry->tid, __entry->ssn, \ + __entry->buf_size, __entry->amsdu, __entry->timeout /* * Tracing for driver callbacks. @@ -970,38 +986,25 @@ DEFINE_EVENT(local_only_evt, drv_tx_last_beacon, TRACE_EVENT(drv_ampdu_action, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu), + struct ieee80211_ampdu_params *params), - TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu), + TP_ARGS(local, sdata, params), TP_STRUCT__entry( LOCAL_ENTRY - STA_ENTRY - __field(u32, action) - __field(u16, tid) - __field(u16, ssn) - __field(u8, buf_size) - __field(bool, amsdu) VIF_ENTRY + AMPDU_ACTION_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - STA_ASSIGN; - __entry->action = action; - __entry->tid = tid; - __entry->ssn = ssn ? *ssn : 0; - __entry->buf_size = buf_size; - __entry->amsdu = amsdu; + AMPDU_ACTION_ASSIGN; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d", - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, - __entry->tid, __entry->buf_size, __entry->amsdu + LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3311ce0f3d6c..62ad5321257d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -710,6 +710,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) info->control.short_preamble = txrc.short_preamble; + /* don't ask rate control when rate already injected via radiotap */ + if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT) + return TX_CONTINUE; + if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); @@ -1266,7 +1270,11 @@ static void ieee80211_drv_tx(struct ieee80211_local *local, if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) netif_stop_subqueue(sdata->dev, ac); - skb_queue_tail(&txqi->queue, skb); + spin_lock_bh(&txqi->queue.lock); + txqi->byte_cnt += skb->len; + __skb_queue_tail(&txqi->queue, skb); + spin_unlock_bh(&txqi->queue.lock); + drv_wake_tx_queue(local, txqi); return; @@ -1294,6 +1302,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + txqi->byte_cnt -= skb->len; + atomic_dec(&sdata->txqs_len[ac]); if (__netif_subqueue_stopped(sdata->dev, ac)) ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); @@ -1665,15 +1675,24 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, ieee80211_tx(sdata, sta, skb, false); } -static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) +static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, + struct sk_buff *skb) { struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[info->band]; int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; + u16 rate = 0; + bool rate_found = false; + u8 rate_retries = 0; + u16 rate_flags = 0; + u8 mcs_known, mcs_flags; + int i; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; @@ -1724,6 +1743,35 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) info->flags |= IEEE80211_TX_CTL_NO_ACK; break; + case IEEE80211_RADIOTAP_RATE: + rate = *iterator.this_arg; + rate_flags = 0; + rate_found = true; + break; + + case IEEE80211_RADIOTAP_DATA_RETRIES: + rate_retries = *iterator.this_arg; + break; + + case IEEE80211_RADIOTAP_MCS: + mcs_known = iterator.this_arg[0]; + mcs_flags = iterator.this_arg[1]; + if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS)) + break; + + rate_found = true; + rate = iterator.this_arg[2]; + rate_flags = IEEE80211_TX_RC_MCS; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI && + mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) + rate_flags |= IEEE80211_TX_RC_SHORT_GI; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && + mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40) + rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + break; + /* * Please update the file * Documentation/networking/mac80211-injection.txt @@ -1738,6 +1786,32 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ return false; + if (rate_found) { + info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + info->control.rates[i].idx = -1; + info->control.rates[i].flags = 0; + info->control.rates[i].count = 0; + } + + if (rate_flags & IEEE80211_TX_RC_MCS) { + info->control.rates[0].idx = rate; + } else { + for (i = 0; i < sband->n_bitrates; i++) { + if (rate * 5 != sband->bitrates[i].bitrate) + continue; + + info->control.rates[0].idx = i; + break; + } + } + + info->control.rates[0].flags = rate_flags; + info->control.rates[0].count = min_t(u8, rate_retries + 1, + local->hw.max_rate_tries); + } + /* * remove the radiotap header * iterator->_max_length was sanity-checked against @@ -1818,10 +1892,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; - /* process and remove the injection radiotap header */ - if (!ieee80211_parse_tx_radiotap(skb)) - goto fail; - rcu_read_lock(); /* @@ -1883,6 +1953,11 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, goto fail_rcu; info->band = chandef->chan->band; + + /* process and remove the injection radiotap header */ + if (!ieee80211_parse_tx_radiotap(local, skb)) + goto fail_rcu; + ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); @@ -2099,8 +2174,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, mpp_lookup = true; } - if (mpp_lookup) + if (mpp_lookup) { mppath = mpp_path_lookup(sdata, skb->data); + if (mppath) + mppath->exp_time = jiffies; + } if (mppath && mpath) mesh_path_del(mpath->sdata, mpath->dst); @@ -2380,7 +2458,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, /* Update skb pointers to various headers since this modified frame * is going to go through Linux networking code that may potentially * need things like pointer to IP header. */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, nh_pos); skb_set_transport_header(skb, h_pos); @@ -3895,9 +3973,9 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, { int ac = ieee802_1d_to_ac[tid & 7]; - skb_set_mac_header(skb, 0); - skb_set_network_header(skb, 0); - skb_set_transport_header(skb, 0); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); skb_set_queue_mapping(skb, ac); skb->priority = tid; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 58f58bd5202f..7390de4946a9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015-2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1928,6 +1928,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) BSS_CHANGED_IDLE | BSS_CHANGED_TXPOWER; + if (sdata->vif.mu_mimo_owner) + changed |= BSS_CHANGED_MU_GROUPS; + switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: changed |= BSS_CHANGED_ASSOC | @@ -2371,10 +2374,23 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, switch (chandef->width) { case NL80211_CHAN_WIDTH_160: - vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ; + /* + * Convert 160 MHz channel width to new style as interop + * workaround. + */ + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx; + if (chandef->chan->center_freq < chandef->center_freq1) + vht_oper->center_freq_seg1_idx -= 8; + else + vht_oper->center_freq_seg1_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: - vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ; + /* + * Convert 80+80 MHz channel width to new style as interop + * workaround. + */ + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; @@ -2390,17 +2406,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_ht_operation *ht_oper, - struct cfg80211_chan_def *chandef) +bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, + struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; - if (!ht_oper) { - cfg80211_chandef_create(chandef, control_chan, - NL80211_CHAN_NO_HT); - return; - } + if (!ht_oper) + return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: @@ -2414,42 +2426,66 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, break; default: channel_type = NL80211_CHAN_NO_HT; + return false; } - cfg80211_chandef_create(chandef, control_chan, channel_type); + cfg80211_chandef_create(chandef, chandef->chan, channel_type); + return true; } -void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_vht_operation *oper, - struct cfg80211_chan_def *chandef) +bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, + struct cfg80211_chan_def *chandef) { + struct cfg80211_chan_def new = *chandef; + int cf1, cf2; + if (!oper) - return; + return false; - chandef->chan = control_chan; + cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, + chandef->chan->band); + cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, + chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: break; case IEEE80211_VHT_CHANWIDTH_80MHZ: - chandef->width = NL80211_CHAN_WIDTH_80; + new.width = NL80211_CHAN_WIDTH_80; + new.center_freq1 = cf1; + /* If needed, adjust based on the newer interop workaround. */ + if (oper->center_freq_seg2_idx) { + unsigned int diff; + + diff = abs(oper->center_freq_seg2_idx - + oper->center_freq_seg1_idx); + if (diff == 8) { + new.width = NL80211_CHAN_WIDTH_160; + new.center_freq1 = cf2; + } else if (diff > 8) { + new.width = NL80211_CHAN_WIDTH_80P80; + new.center_freq2 = cf2; + } + } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: - chandef->width = NL80211_CHAN_WIDTH_160; + new.width = NL80211_CHAN_WIDTH_160; + new.center_freq1 = cf1; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - chandef->width = NL80211_CHAN_WIDTH_80P80; + new.width = NL80211_CHAN_WIDTH_80P80; + new.center_freq1 = cf1; + new.center_freq2 = cf2; break; default: - break; + return false; } - chandef->center_freq1 = - ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, - control_chan->band); - chandef->center_freq2 = - ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, - control_chan->band); + if (!cfg80211_chandef_valid(&new)) + return false; + + *chandef = new; + return true; } int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, @@ -2672,6 +2708,18 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, sband = local->hw.wiphy->bands[status->band]; bitrate = sband->bitrates[status->rate_idx].bitrate; ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift)); + + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + /* TODO: handle HT/VHT preambles */ + if (status->band == IEEE80211_BAND_5GHZ) { + ts += 20 << shift; + mpdu_offset += 2; + } else if (status->flag & RX_FLAG_SHORTPRE) { + ts += 96; + } else { + ts += 192; + } + } } rate = cfg80211_calculate_bitrate(&ri); @@ -3357,3 +3405,17 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, txqi->txq.ac = IEEE80211_AC_BE; } } + +void ieee80211_txq_get_depth(struct ieee80211_txq *txq, + unsigned long *frame_cnt, + unsigned long *byte_cnt) +{ + struct txq_info *txqi = to_txq_info(txq); + + if (frame_cnt) + *frame_cnt = txqi->queue.qlen; + + if (byte_cnt) + *byte_cnt = txqi->byte_cnt; +} +EXPORT_SYMBOL(ieee80211_txq_get_depth); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c38b2f07a919..89e04d55aa18 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -1,6 +1,9 @@ /* * VHT handling * + * Portions of this file + * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -278,6 +281,23 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, } sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + /* If HT IE reported 3839 bytes only, stay with that size. */ + if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839) + return; + + switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } } enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) @@ -425,6 +445,43 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, return changed; } +void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt) +{ + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + + if (!sdata->vif.mu_mimo_owner) + return; + + if (!memcmp(mgmt->u.action.u.vht_group_notif.position, + bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) && + !memcmp(mgmt->u.action.u.vht_group_notif.membership, + bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) + return; + + memcpy(bss_conf->mu_group.membership, + mgmt->u.action.u.vht_group_notif.membership, + WLAN_MEMBERSHIP_LEN); + memcpy(bss_conf->mu_group.position, + mgmt->u.action.u.vht_group_notif.position, + WLAN_USER_POSITION_LEN); + + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS); +} + +void ieee80211_update_mu_groups(struct ieee80211_vif *vif, + const u8 *membership, const u8 *position) +{ + struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; + + if (WARN_ON_ONCE(!vif->mu_mimo_owner)) + return; + + memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); + memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); +} +EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); + void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum ieee80211_band band) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index d824c38971ed..18848258adde 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2008, Jouni Malinen <j@w1.fi> + * Copyright (C) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -183,7 +184,6 @@ mic_fail_no_key: return RX_DROP_UNUSABLE; } - static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; @@ -191,6 +191,7 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; int len, tail; + u64 pn; u8 *pos; if (info->control.hw_key && @@ -222,12 +223,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) return 0; /* Increase IV for the frame */ - spin_lock(&key->u.tkip.txlock); - key->u.tkip.tx.iv16++; - if (key->u.tkip.tx.iv16 == 0) - key->u.tkip.tx.iv32++; - pos = ieee80211_tkip_add_iv(pos, key); - spin_unlock(&key->u.tkip.txlock); + pn = atomic64_inc_return(&key->conf.tx_pn); + pos = ieee80211_tkip_add_iv(pos, &key->conf, pn); /* hwaccel - with software IV */ if (info->control.hw_key) diff --git a/net/mac802154/main.c b/net/mac802154/main.c index e8cab5bb80c6..87da85ae5a6b 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -218,7 +218,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw) tasklet_kill(&local->tasklet); flush_workqueue(local->workqueue); - destroy_workqueue(local->workqueue); rtnl_lock(); @@ -226,6 +225,7 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw) rtnl_unlock(); + destroy_workqueue(local->workqueue); wpan_phy_unregister(local->phy); } EXPORT_SYMBOL(ieee802154_unregister_hw); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index fb31aa87de81..644a8da6d4bd 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void) } module_exit(mpls_iptunnel_exit); +MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8c067e6663a1..95e757c377f9 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IP6_NF_IPTABLES != n + select NF_DUP_IPV6 if IPV6 ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 0328f7250693..299edc6add5a 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - INIT_LIST_HEAD(&ipvs->app_list); - proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); + proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops); return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - unregister_ip_vs_app(ipvs, NULL /* all */); - remove_proc_entry("ip_vs_app", net->proc_net); + remove_proc_entry("ip_vs_app", ipvs->net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e7c1b052c2a3..404b2a4f4b5b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - pr_info("%s: enter\n", __func__); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services--; @@ -3947,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; int i, idx; /* Initialize rs_table */ @@ -3974,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) spin_lock_init(&ipvs->tot_stats.lock); - proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); - proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); - proc_create("ip_vs_stats_percpu", 0, net->proc_net, + proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net, &ip_vs_stats_percpu_fops); if (ip_vs_control_net_init_sysctl(ipvs)) @@ -3991,13 +3988,11 @@ err: void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); - remove_proc_entry("ip_vs_stats_percpu", net->proc_net); - remove_proc_entry("ip_vs_stats", net->proc_net); - remove_proc_entry("ip_vs", net->proc_net); + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + remove_proc_entry("ip_vs", ipvs->net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index a3f5cd9b3c4c..dc196a0f501d 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -531,8 +531,6 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); } return ret; } @@ -573,8 +571,6 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -595,8 +591,6 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 58882de06bd7..f60b4fdeeb8c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1412,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), } spin_unlock(lockp); local_bh_enable(); + cond_resched(); } for_each_possible_cpu(cpu) { @@ -1424,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), set_bit(IPS_DYING_BIT, &ct->status); } spin_unlock_bh(&pcpu->lock); + cond_resched(); } return NULL; found: @@ -1440,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net, struct nf_conn *ct; unsigned int bucket = 0; + might_sleep(); + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1448,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net, /* ... else the timer will get him soon. */ nf_ct_put(ct); + cond_resched(); } } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 8414ee1a0319..7ec69723940f 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -31,7 +31,6 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) skb_push(skb, skb->mac_len); skb->dev = dev; - skb_sender_cpu_clear(skb); dev_queue_xmit(skb); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a7ba23353dab..2278d9ab723b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) -{ - return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); -} -EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); - int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { @@ -311,14 +304,14 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } @@ -328,10 +321,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ @@ -406,7 +401,7 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); status |= NFNL_BATCH_FAILURE; goto done; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 94837d236ab0..2671b9deb103 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -312,7 +312,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) untimeout(h, timeout); } - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } local_bh_enable(); } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 8ca932057c13..11f81c8385fc 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); + skb = alloc_skb(n, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = nfnetlink_alloc_skb(net, pkt_size, - peer_portid, GFP_ATOMIC); + skb = alloc_skb(pkt_size, GFP_ATOMIC); } } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1d3936587ace..75429997ed41 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0, rem_len = 0; + size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; - rem_len = data_len - hlen; break; } @@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, - GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); return NULL; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c7808fc19719..c9743f78f219 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx, cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); @@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, GFP_ATOMIC); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 9aea747b43ea..81b5ad6165ac 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -17,7 +17,9 @@ #include <net/netfilter/nft_masq.h> const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_masq_policy); @@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); struct nft_masq *priv = nft_expr_priv(expr); int err; @@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx, if (err) return err; - if (tb[NFTA_MASQ_FLAGS] == NULL) - return 0; - - priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + if (tb[NFTA_MASQ_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + + if (tb[NFTA_MASQ_REG_PROTO_MIN]) { + priv->sreg_proto_min = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); + + err = nft_validate_register_load(priv->sreg_proto_min, plen); + if (err < 0) + return err; + + if (tb[NFTA_MASQ_REG_PROTO_MAX]) { + priv->sreg_proto_max = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); + + err = nft_validate_register_load(priv->sreg_proto_max, + plen); + if (err < 0) + return err; + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } + } return 0; } @@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_masq *priv = nft_expr_priv(expr); - if (priv->flags == 0) - return 0; - - if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + if (priv->flags != 0 && + nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN, + priv->sreg_proto_min) || + nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX, + priv->sreg_proto_max)) + goto nla_put_failure; + } + return 0; nla_put_failure: diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fe885bf271c5..16c50b0dd426 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -28,6 +28,8 @@ #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif + case NFT_META_PRANDOM: { + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + *dest = prandom_u32_state(state); + break; + } default: WARN_ON(1); goto err; @@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: len = IFNAMSIZ; break; + case NFT_META_PRANDOM: + prandom_init_once(&nft_prandom_state); + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c8a0b7da5ff4..d0cd2b9bf844 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -694,12 +694,45 @@ EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { - struct xt_table *t; + struct xt_table *t, *found = NULL; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; + + if (net == &init_net) + goto out; + + /* Table doesn't exist in this netns, re-try init */ + list_for_each_entry(t, &init_net.xt.tables[af], list) { + if (strcmp(t->name, name)) + continue; + if (!try_module_get(t->me)) + return NULL; + + mutex_unlock(&xt[af].mutex); + if (t->table_init(net) != 0) { + module_put(t->me); + return NULL; + } + + found = t; + + mutex_lock(&xt[af].mutex); + break; + } + + if (!found) + goto out; + + /* and once again: */ + list_for_each_entry(t, &net->xt.tables[af], list) + if (strcmp(t->name, name) == 0) + return t; + + module_put(found->me); + out: mutex_unlock(&xt[af].mutex); return NULL; } @@ -1170,20 +1203,20 @@ static const struct file_operations xt_target_ops = { #endif /* CONFIG_PROC_FS */ /** - * xt_hook_link - set up hooks for a new table + * xt_hook_ops_alloc - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * - * This function will take care of creating and registering the necessary - * Netfilter hooks for XT tables. + * This function will create the nf_hook_ops that the x_table needs + * to hand to xt_hook_link_net(). */ -struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +struct nf_hook_ops * +xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; - int ret; ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) @@ -1200,27 +1233,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) ++i; } - ret = nf_register_hooks(ops, num_hooks); - if (ret < 0) { - kfree(ops); - return ERR_PTR(ret); - } - return ops; } -EXPORT_SYMBOL_GPL(xt_hook_link); - -/** - * xt_hook_unlink - remove hooks for a table - * @ops: nf_hook_ops array as returned by nf_hook_link - * @hook_mask: the very same mask that was passed to nf_hook_link - */ -void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) -{ - nf_unregister_hooks(ops, hweight32(table->valid_hooks)); - kfree(ops); -} -EXPORT_SYMBOL_GPL(xt_hook_unlink); +EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_proto_init(struct net *net, u_int8_t af) { diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3eff7b67cdf2..6e57a3966dc5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) { .name = "TEE", .revision = 1, diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 4e3c3affd285..2455b69b5810 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -262,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) if (f->opt[optnum].kind == (*optp)) { __u32 len = f->opt[optnum].length; const __u8 *optend = optp + len; - int loop_cont = 0; fmatch = FMATCH_OK; @@ -275,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: - loop_cont = 1; break; } diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f0cb92f3ddaf..ada67422234b 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl { static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) -static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; -static struct netlbl_dom_map *netlbl_domhsh_def = NULL; +static struct netlbl_domhsh_tbl *netlbl_domhsh; +static struct netlbl_dom_map *netlbl_domhsh_def; /* * Domain Hash Table Helper Functions diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index b0380927f05f..9eaa9a1e8629 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg { static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) -static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; +static struct netlbl_unlhsh_tbl *netlbl_unlhsh; +static struct netlbl_unlhsh_iface *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ -static u8 netlabel_unlabel_acceptflg = 0; +static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig index 2c5e95e9bfbd..5d6e8c05b3d4 100644 --- a/net/netlink/Kconfig +++ b/net/netlink/Kconfig @@ -2,15 +2,6 @@ # Netlink Sockets # -config NETLINK_MMAP - bool "NETLINK: mmaped IO" - ---help--- - This option enables support for memory mapped netlink IO. This - reduces overhead by avoiding copying data between kernel- and - userspace. - - If unsure, say N. - config NETLINK_DIAG tristate "NETLINK: socket monitoring interface" default n diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1ffb34e253f..c8416792cce0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, dev_hold(dev); - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk) wake_up_interruptible(&nlk->wait); } -#ifdef CONFIG_NETLINK_MMAP -static bool netlink_rx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->rx_ring.pg_vec != NULL; -} - -static bool netlink_tx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->tx_ring.pg_vec != NULL; -} - -static __pure struct page *pgvec_to_page(const void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - else - return virt_to_page(addr); -} - -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; i++) { - if (pg_vec[i] != NULL) { - if (is_vmalloc_addr(pg_vec[i])) - vfree(pg_vec[i]); - else - free_pages((unsigned long)pg_vec[i], order); - } - } - kfree(pg_vec); -} - -static void *alloc_one_pg_vec_page(unsigned long order) -{ - void *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | - __GFP_NOWARN | __GFP_NORETRY; - - buffer = (void *)__get_free_pages(gfp_flags, order); - if (buffer != NULL) - return buffer; - - buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer != NULL) - return buffer; - - gfp_flags &= ~__GFP_NORETRY; - return (void *)__get_free_pages(gfp_flags, order); -} - -static void **alloc_pg_vec(struct netlink_sock *nlk, - struct nl_mmap_req *req, unsigned int order) -{ - unsigned int block_nr = req->nm_block_nr; - unsigned int i; - void **pg_vec; - - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); - if (pg_vec == NULL) - return NULL; - - for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (pg_vec[i] == NULL) - goto err1; - } - - return pg_vec; -err1: - free_pg_vec(pg_vec, order, block_nr); - return NULL; -} - - -static void -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, - unsigned int order) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct sk_buff_head *queue; - struct netlink_ring *ring; - - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); -} - -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool tx_ring) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - void **pg_vec = NULL; - unsigned int order = 0; - - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - - if (req->nm_block_nr) { - if (ring->pg_vec != NULL) - return -EBUSY; - - if ((int)req->nm_block_size <= 0) - return -EINVAL; - if (!PAGE_ALIGNED(req->nm_block_size)) - return -EINVAL; - if (req->nm_frame_size < NL_MMAP_HDRLEN) - return -EINVAL; - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) - return -EINVAL; - - ring->frames_per_block = req->nm_block_size / - req->nm_frame_size; - if (ring->frames_per_block == 0) - return -EINVAL; - if (ring->frames_per_block * req->nm_block_nr != - req->nm_frame_nr) - return -EINVAL; - - order = get_order(req->nm_block_size); - pg_vec = alloc_pg_vec(nlk, req, order); - if (pg_vec == NULL) - return -ENOMEM; - } else { - if (req->nm_frame_nr) - return -EINVAL; - } - - mutex_lock(&nlk->pg_vec_lock); - if (atomic_read(&nlk->mapped) == 0) { - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); - mutex_unlock(&nlk->pg_vec_lock); - return 0; - } - - mutex_unlock(&nlk->pg_vec_lock); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); - - return -EBUSY; -} - -static void netlink_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_inc(&nlk_sk(sk)->mapped); -} - -static void netlink_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_dec(&nlk_sk(sk)->mapped); -} - -static const struct vm_operations_struct netlink_mmap_ops = { - .open = netlink_mm_open, - .close = netlink_mm_close, -}; - -static int netlink_mmap(struct file *file, struct socket *sock, - struct vm_area_struct *vma) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - unsigned long start, size, expected; - unsigned int i; - int err = -EINVAL; - - if (vma->vm_pgoff) - return -EINVAL; - - mutex_lock(&nlk->pg_vec_lock); - - expected = 0; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; - } - - if (expected == 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size != expected) - goto out; - - start = vma->vm_start; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - - for (i = 0; i < ring->pg_vec_len; i++) { - struct page *page; - void *kaddr = ring->pg_vec[i]; - unsigned int pg_num; - - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { - page = pgvec_to_page(kaddr); - err = vm_insert_page(vma, start, page); - if (err < 0) - goto out; - start += PAGE_SIZE; - kaddr += PAGE_SIZE; - } - } - } - - atomic_inc(&nlk->mapped); - vma->vm_ops = &netlink_mmap_ops; - err = 0; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) -{ -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - struct page *p_start, *p_end; - - /* First page is flushed through netlink_{get,set}_status */ - p_start = pgvec_to_page(hdr + PAGE_SIZE); - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } -#endif -} - -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) -{ - smp_rmb(); - flush_dcache_page(pgvec_to_page(hdr)); - return hdr->nm_status; -} - -static void netlink_set_status(struct nl_mmap_hdr *hdr, - enum nl_mmap_status status) -{ - smp_mb(); - hdr->nm_status = status; - flush_dcache_page(pgvec_to_page(hdr)); -} - -static struct nl_mmap_hdr * -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) -{ - unsigned int pg_vec_pos, frame_off; - - pg_vec_pos = pos / ring->frames_per_block; - frame_off = pos % ring->frames_per_block; - - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); -} - -static struct nl_mmap_hdr * -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, - enum nl_mmap_status status) -{ - struct nl_mmap_hdr *hdr; - - hdr = __netlink_lookup_frame(ring, pos); - if (netlink_get_status(hdr) != status) - return NULL; - - return hdr; -} - -static struct nl_mmap_hdr * -netlink_current_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - return netlink_lookup_frame(ring, ring->head, status); -} - -static void netlink_increment_head(struct netlink_ring *ring) -{ - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; -} - -static void netlink_forward_ring(struct netlink_ring *ring) -{ - unsigned int head = ring->head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, ring->head); - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) - break; - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) - break; - netlink_increment_head(ring); - } while (ring->head != head); -} - -static bool netlink_has_valid_frame(struct netlink_ring *ring) -{ - unsigned int head = ring->head, pos = head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, pos); - if (hdr->nm_status == NL_MMAP_STATUS_VALID) - return true; - pos = pos != 0 ? pos - 1 : ring->frame_max; - } while (pos != head); - - return false; -} - -static bool netlink_dump_space(struct netlink_sock *nlk) -{ - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - unsigned int n; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - return false; - - n = ring->head + ring->frame_max / 2; - if (n > ring->frame_max) - n -= ring->frame_max; - - hdr = __netlink_lookup_frame(ring, n); - - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; -} - -static unsigned int netlink_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int mask; - int err; - - if (nlk->rx_ring.pg_vec != NULL) { - /* Memory mapped sockets don't call recvmsg(), so flow control - * for dumps is performed here. A dump is allowed to continue - * if at least half the ring is unused. - */ - while (nlk->cb_running && netlink_dump_space(nlk)) { - err = netlink_dump(sk); - if (err < 0) { - sk->sk_err = -err; - sk->sk_error_report(sk); - break; - } - } - netlink_rcv_wake(sk); - } - - mask = datagram_poll(file, sock, wait); - - /* We could already have received frames in the normal receive - * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, - * so if mask contains pollin/etc already, there's no point - * walking the ring. - */ - if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } - - spin_lock_bh(&sk->sk_write_queue.lock); - if (nlk->tx_ring.pg_vec) { - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLOUT | POLLWRNORM; - } - spin_unlock_bh(&sk->sk_write_queue.lock); - - return mask; -} - -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) -{ - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); -} - -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, - struct netlink_ring *ring, - struct nl_mmap_hdr *hdr) -{ - unsigned int size; - void *data; - - size = ring->frame_size - NL_MMAP_HDRLEN; - data = (void *)hdr + NL_MMAP_HDRLEN; - - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - - skb->destructor = netlink_skb_destructor; - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; - NETLINK_CB(skb).sk = sk; -} - -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, - u32 dst_portid, u32 dst_group, - struct scm_cookie *scm) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - struct sk_buff *skb; - unsigned int maxlen; - int err = 0, len = 0; - - mutex_lock(&nlk->pg_vec_lock); - - ring = &nlk->tx_ring; - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - - do { - unsigned int nm_len; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); - if (hdr == NULL) { - if (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending)) - schedule(); - continue; - } - - nm_len = ACCESS_ONCE(hdr->nm_len); - if (nm_len > maxlen) { - err = -EINVAL; - goto out; - } - - netlink_frame_flush_dcache(hdr, nm_len); - - skb = alloc_skb(nm_len, GFP_KERNEL); - if (skb == NULL) { - err = -ENOBUFS; - goto out; - } - __skb_put(skb, nm_len); - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - - netlink_increment_head(ring); - - NETLINK_CB(skb).portid = nlk->portid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = scm->creds; - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (unlikely(dst_group)) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_portid, dst_group, - GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_portid, - msg->msg_flags & MSG_DONTWAIT); - if (err < 0) - goto out; - len += err; - - } while (hdr != NULL || - (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending))); - - if (len > 0) - err = len; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) -{ - struct nl_mmap_hdr *hdr; - - hdr = netlink_mmap_hdr(skb); - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_frame_flush_dcache(hdr, hdr->nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; - kfree_skb(skb); -} - -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - - spin_lock_bh(&sk->sk_receive_queue.lock); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - kfree_skb(skb); - netlink_overrun(sk); - return; - } - netlink_increment_head(ring); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); -} - -#else /* CONFIG_NETLINK_MMAP */ -#define netlink_rx_is_mmaped(sk) false -#define netlink_tx_is_mmaped(sk) false -#define netlink_mmap sock_no_mmap -#define netlink_poll datagram_poll -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0 -#endif /* CONFIG_NETLINK_MMAP */ - static void netlink_skb_destructor(struct sk_buff *skb) { -#ifdef CONFIG_NETLINK_MMAP - struct nl_mmap_hdr *hdr; - struct netlink_ring *ring; - struct sock *sk; - - /* If a packet from the kernel to userspace was freed because of an - * error without being delivered to userspace, the kernel must reset - * the status. In the direction userspace to kernel, the status is - * always reset here after the packet was processed and freed. - */ - if (netlink_skb_is_mmaped(skb)) { - hdr = netlink_mmap_hdr(skb); - sk = NETLINK_CB(skb).sk; - - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - ring = &nlk_sk(sk)->tx_ring; - } else { - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - } - ring = &nlk_sk(sk)->rx_ring; - } - - WARN_ON(atomic_read(&ring->pending) == 0); - atomic_dec(&ring->pending); - sock_put(sk); - - skb->head = NULL; - } -#endif if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) @@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); -#ifdef CONFIG_NETLINK_MMAP - if (1) { - struct nl_mmap_req req; - - memset(&req, 0, sizeof(req)); - if (nlk->rx_ring.pg_vec) - __netlink_set_ring(sk, &req, false, NULL, 0); - memset(&req, 0, sizeof(req)); - if (nlk->tx_ring.pg_vec) - __netlink_set_ring(sk, &req, true, NULL, 0); - } -#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); -#ifdef CONFIG_NETLINK_MMAP - mutex_init(&nlk->pg_vec_lock); -#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state)) && - !netlink_skb_is_mmaped(skb)) { + test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) netlink_deliver_tap(skb); -#ifdef CONFIG_NETLINK_MMAP - if (netlink_skb_is_mmaped(skb)) - netlink_queue_mmaped_skb(sk, skb); - else if (netlink_rx_is_mmaped(sk)) - netlink_ring_set_copied(sk, skb); - else -#endif /* CONFIG_NETLINK_MMAP */ - skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } @@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) int delta; WARN_ON(skb->sk != NULL); - if (netlink_skb_is_mmaped(skb)) - return skb; - delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; @@ -1876,79 +1248,6 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, - unsigned int ldiff, u32 dst_portid, - gfp_t gfp_mask) -{ -#ifdef CONFIG_NETLINK_MMAP - unsigned int maxlen, linear_size; - struct sock *sk = NULL; - struct sk_buff *skb; - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - - sk = netlink_getsockbyportid(ssk, dst_portid); - if (IS_ERR(sk)) - goto out; - - ring = &nlk_sk(sk)->rx_ring; - /* fast-path without atomic ops for common case: non-mmaped receiver */ - if (ring->pg_vec == NULL) - goto out_put; - - /* We need to account the full linear size needed as a ring - * slot cannot have non-linear parts. - */ - linear_size = size + ldiff; - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) - goto out_put; - - skb = alloc_skb_head(gfp_mask); - if (skb == NULL) - goto err1; - - spin_lock_bh(&sk->sk_receive_queue.lock); - /* check again under lock */ - if (ring->pg_vec == NULL) - goto out_free; - - /* check again under lock */ - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < linear_size) - goto out_free; - - netlink_forward_ring(ring); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - goto err2; - - netlink_ring_setup_skb(skb, sk, ring, hdr); - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); - atomic_inc(&ring->pending); - netlink_increment_head(ring); - - spin_unlock_bh(&sk->sk_receive_queue.lock); - return skb; - -err2: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - netlink_overrun(sk); -err1: - sock_put(sk); - return NULL; - -out_free: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); -out_put: - sock_put(sk); -out: -#endif - return alloc_skb(size, gfp_mask); -} -EXPORT_SYMBOL_GPL(__netlink_alloc_skb); - int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -2225,8 +1524,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && - optlen >= sizeof(int) && + if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -2279,25 +1577,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } err = 0; break; -#ifdef CONFIG_NETLINK_MMAP - case NETLINK_RX_RING: - case NETLINK_TX_RING: { - struct nl_mmap_req req; - - /* Rings might consume more memory than queue limits, require - * CAP_NET_ADMIN. - */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (optlen < sizeof(req)) - return -EINVAL; - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - err = netlink_set_ring(sk, &req, - optname == NETLINK_TX_RING); - break; - } -#endif /* CONFIG_NETLINK_MMAP */ case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; @@ -2467,18 +1746,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) smp_rmb(); } - /* It's a really convoluted way for userland to ask for mmaped - * sendmsg(), but that's what we've got... - */ - if (netlink_tx_is_mmaped(sk) && - iter_is_iovec(&msg->msg_iter) && - msg->msg_iter.nr_segs == 1 && - msg->msg_iter.iov->iov_base == NULL) { - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, - &scm); - goto out; - } - err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; @@ -2794,8 +2061,7 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - if (!netlink_rx_is_mmaped(sk) && - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being @@ -2808,15 +2074,12 @@ static int netlink_dump(struct sock *sk) if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); + skb = alloc_skb(alloc_size, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL); + skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; @@ -2831,8 +2094,7 @@ static int netlink_dump(struct sock *sk) * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ - if (!netlink_rx_is_mmaped(sk)) - skb_reserve(skb, skb_tailroom(skb) - alloc_size); + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2884,16 +2146,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_sock *nlk; int ret; - /* Memory mapped dump requests need to be copied to avoid looping - * on the pending state in netlink_mmap_sendmsg() while the CB hold - * a reference to the skb. - */ - if (netlink_skb_is_mmaped(skb)) { - skb = skb_copy(skb, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - } else - atomic_inc(&skb->users); + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { @@ -2966,8 +2219,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); - skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), - NETLINK_CB(in_skb).portid, GFP_KERNEL); + skb = nlmsg_new(payload, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -3241,7 +2493,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = netlink_poll, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -3249,7 +2501,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = netlink_mmap, + .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 14437d9b1965..e68ef9ccd703 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -44,12 +44,6 @@ struct netlink_sock { int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); struct module *module; -#ifdef CONFIG_NETLINK_MMAP - struct mutex pg_vec_lock; - struct netlink_ring rx_ring; - struct netlink_ring tx_ring; - atomic_t mapped; -#endif /* CONFIG_NETLINK_MMAP */ struct rhash_head node; struct rcu_head rcu; @@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ -#ifdef CONFIG_NETLINK_MMAP - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -#else - return false; -#endif /* CONFIG_NETLINK_MMAP */ -} - struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 3ee63a3cff30..8dd836a8dd60 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -8,41 +8,6 @@ #include "af_netlink.h" -#ifdef CONFIG_NETLINK_MMAP -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, - struct sk_buff *nlskb) -{ - struct netlink_diag_ring ndr; - - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; - ndr.ndr_block_nr = ring->pg_vec_len; - ndr.ndr_frame_size = ring->frame_size; - ndr.ndr_frame_nr = ring->frame_max + 1; - - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); -} - -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - int ret; - - mutex_lock(&nlk->pg_vec_lock); - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); - if (!ret) - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, - nlskb); - mutex_unlock(&nlk->pg_vec_lock); - - return ret; -} -#else -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - return 0; -} -#endif - static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && - sk_diag_put_rings_cfg(sk, skb)) - goto out_nlmsg_trim; - nlmsg_end(skb, nlh); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0ffd721126e7..a09132a69869 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -463,26 +463,6 @@ int genl_unregister_family(struct genl_family *family) EXPORT_SYMBOL(genl_unregister_family); /** - * genlmsg_new_unicast - Allocate generic netlink message for unicast - * @payload: size of the message payload - * @info: information on destination - * @flags: the type of memory to allocate - * - * Allocates a new sk_buff large enough to cover the specified payload - * plus required Netlink headers. Will check receiving socket for - * memory mapped i/o capability and use it if enabled. Will fall back - * to non-mapped skb if message size exceeds the frame size of the ring. - */ -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags) -{ - size_t len = nlmsg_total_size(genlmsg_total_size(payload)); - - return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); -} -EXPORT_SYMBOL_GPL(genlmsg_new_unicast); - -/** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to @@ -642,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; - info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9f6654..cd5fd9d728a7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -10,6 +10,7 @@ config OPENVSWITCH select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2d59df521915..e9dd47b2a85b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, - MPLS_HLEN, 0)); + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; @@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, mask->eth_dst); - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); @@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); memcpy(skb->data, &data->l2_data, data->l2_len); - ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); ovs_vport_send(vport, skb); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d6f7fe92744a..0cc66a4e492d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - struct genl_info info = { - .dst_sk = ovs_dp_get_net(dp)->genl_sock, - .snd_portid = upcall_info->portid, - }; size_t len; unsigned int hlen; int err, dp_ifindex; @@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, hlen = skb->len; len = upcall_msg_size(upcall_info, hlen); - user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1100,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int error; + int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; - /* Extract key. */ - error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR(log, "Flow key attribute not present in set flow."); - goto error; - } - ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); - ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], - a[OVS_FLOW_ATTR_MASK], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow set message rejected, Key attribute missing."); + error = -EINVAL; + } if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, + "Flow key attribute not present in set flow."); + error = -EINVAL; + goto error; + } + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, log); if (IS_ERR(acts)) { @@ -1481,9 +1483,9 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { - return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ @@ -1536,7 +1538,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1657,7 +1659,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1690,7 +1692,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1723,7 +1725,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1912,6 +1914,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1977,6 +2002,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2043,8 +2074,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2066,7 +2099,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9fdc1..427e39a045cf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a45ca2d..58b8efc23668 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a792f..83a5534abd31 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6a6adf314363..4e3972344aa6 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb) return; skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1605691d9414..5eb7694348b5 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, }; if (!options) { diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index c10899cb9040..f01f28a567ad 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv) int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); -} - static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b7e7851ddc5d..1ecfa710ca98 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -557,9 +557,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, { struct net_device *dev; unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; int err; - u32 speed; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); @@ -567,19 +566,19 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } - err = __ethtool_get_settings(dev, &ecmd); - speed = ethtool_cmd_speed(&ecmd); + err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (!err) { /* * If the link speed is so slow you don't really * need to worry about perf anyways */ - if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) { return DEFAULT_PRB_RETIRE_TOV; } else { msec = 1; - div = speed / 1000; + div = ecmd.base.speed / 1000; } } @@ -1916,6 +1915,10 @@ retry: goto retry; } + if (!dev_validate_header(dev, skb->data, len)) { + err = -EINVAL; + goto out_unlock; + } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; @@ -2394,18 +2397,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static bool ll_header_truncated(const struct net_device *dev, int len) -{ - /* net device doesn't like empty head */ - if (unlikely(len < dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", - current->comm, len, dev->hard_header_len); - return true; - } - - return false; -} - static void tpacket_set_protocol(const struct net_device *dev, struct sk_buff *skb) { @@ -2523,16 +2514,20 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { + int hdrlen = min_t(int, copylen, tp_len); + skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); - err = skb_store_bits(skb, 0, data, copylen); + err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; + if (!dev_validate_header(dev, skb->data, hdrlen)) + return -EINVAL; if (!skb->protocol) tpacket_set_protocol(dev, skb); - data += copylen; - to_write -= copylen; + data += hdrlen; + to_write -= hdrlen; } offset = offset_in_page(data); @@ -2704,13 +2699,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } - if (dev->hard_header_len) { - if (ll_header_truncated(dev, tp_len)) { - tp_len = -EINVAL; - goto tpacket_error; - } - copylen = max_t(int, copylen, dev->hard_header_len); - } + copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), @@ -2906,9 +2895,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; - } else { - if (ll_header_truncated(dev, len)) - goto out_free; } /* Returns -EFAULT on error */ @@ -2916,6 +2902,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; + if (sock->type == SOCK_RAW && + !dev_validate_header(dev, skb->data, len)) { + err = -EINVAL; + goto out_free; + } + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && diff --git a/net/rds/Kconfig b/net/rds/Kconfig index f2c670ba7b9b..bffde4b46c5d 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -4,14 +4,13 @@ config RDS depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, - sequenced delivery of datagrams over Infiniband, iWARP, - or TCP. + sequenced delivery of datagrams over Infiniband or TCP. config RDS_RDMA - tristate "RDS over Infiniband and iWARP" + tristate "RDS over Infiniband" depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS ---help--- - Allow RDS to use Infiniband and iWARP as a transport. + Allow RDS to use Infiniband as a transport. This transport supports RDMA operations. config RDS_TCP diff --git a/net/rds/Makefile b/net/rds/Makefile index 56d3f6023ced..0e72bec1529f 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ obj-$(CONFIG_RDS_RDMA) += rds_rdma.o rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ - ib_sysctl.o ib_rdma.o \ - iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ - iw_sysctl.o iw_rdma.o + ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o obj-$(CONFIG_RDS_TCP) += rds_tcp.o diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b5476aebd68d..6beaeb1138f3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } +static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, + int optlen) +{ + int val, valbool; + + if (optlen != sizeof(int)) + return -EFAULT; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 1 : 0; + + if (valbool) + sock_set_flag(sk, SOCK_RCVTSTAMP); + else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_set_transport(rs, optval, optlen); release_sock(sock->sk); break; + case SO_TIMESTAMP: + lock_sock(sock->sk); + ret = rds_enable_recvtstamp(sock->sk, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } diff --git a/net/rds/ib.c b/net/rds/ib.c index 9481d55ff6cb..b5342fddaf98 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -42,15 +42,16 @@ #include "rds.h" #include "ib.h" +#include "ib_mr.h" -unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; -unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; +unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(rds_ib_fmr_1m_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA"); -module_param(rds_ib_fmr_8k_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); +module_param(rds_ib_mr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); +module_param(rds_ib_mr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); + rds_ibdev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), - rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; - rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + rds_ibdev->max_8k_mrs = device->attrs.max_mr ? min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), - rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; + rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; @@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, - rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, - rds_ibdev->max_8k_fmrs); + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, + rds_ibdev->max_8k_mrs); + + pr_info("RDS/IB: %s: %s supported and preferred\n", + device->name, + rds_ibdev->use_fastreg ? "FRMR" : "FMR"); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); @@ -364,7 +375,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); - rds_ib_fmr_exit(); + rds_ib_mr_exit(); } struct rds_transport rds_ib_transport = { @@ -400,13 +411,13 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = rds_ib_fmr_init(); + ret = rds_ib_mr_init(); if (ret) goto out; ret = ib_register_client(&rds_ib_client); if (ret) - goto out_fmr_exit; + goto out_mr_exit; ret = rds_ib_sysctl_init(); if (ret) @@ -430,8 +441,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); -out_fmr_exit: - rds_ib_fmr_exit(); +out_mr_exit: + rds_ib_mr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index b3fdebb57460..627fb79aee65 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,17 +9,12 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_1M_POOL_SIZE (8192 / 2) -#define RDS_FMR_1M_MSG_SIZE 256 -#define RDS_FMR_8K_MSG_SIZE 2 -#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) -#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) - #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_FR_WR 512 #define RDS_IB_DEFAULT_RETRY_COUNT 2 @@ -28,7 +23,6 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 #define RDS_IB_WC_MAX 32 -#define RDS_IB_SEND_OP BIT_ULL(63) extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -129,6 +123,9 @@ struct rds_ib_connection { struct ib_wc i_send_wc[RDS_IB_WC_MAX]; struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + /* To control the number of wrs from fastreg */ + atomic_t i_fastreg_wrs; + /* interrupt handling */ struct tasklet_struct i_send_tasklet; struct tasklet_struct i_recv_tasklet; @@ -207,12 +204,16 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - unsigned int max_fmrs; + bool has_fmr; + bool has_fr; + bool use_fastreg; + + unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; struct rds_ib_mr_pool *mr_8k_pool; unsigned int fmr_max_remaps; - unsigned int max_8k_fmrs; - unsigned int max_1m_fmrs; + unsigned int max_8k_mrs; + unsigned int max_1m_mrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -266,6 +267,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_pool_flush; uint64_t s_ib_rdma_mr_1m_pool_wait; uint64_t s_ib_rdma_mr_1m_pool_depleted; + uint64_t s_ib_rdma_mr_8k_reused; + uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int rds_ib_fmr_1m_pool_size; -extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, - int npages); -void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); -void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_ib_sync_mr(void *trans_private, int dir); -void rds_ib_free_mr(void *trans_private, int invalidate); -void rds_ib_flush_mrs(void); -int rds_ib_fmr_init(void); -void rds_ib_fmr_exit(void); +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index da5a7fb98c77..8764970f0c24 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_recv_tasklet); } -static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, - struct ib_wc *wcs, - struct rds_ib_ack_state *ack_state) +static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs) { - int nr; - int i; + int nr, i; struct ib_wc *wc; while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { @@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, (unsigned long long)wc->wr_id, wc->status, wc->byte_len, be32_to_cpu(wc->ex.imm_data)); - if (wc->wr_id & RDS_IB_SEND_OP) + if (wc->wr_id <= ic->i_send_ring.w_nr || + wc->wr_id == RDS_IB_ACK_WR_ID) rds_ib_send_cqe_handler(ic, wc); else - rds_ib_recv_cqe_handler(ic, wc, ack_state); + rds_ib_mr_cqe_handler(ic, wc); + } } } @@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state; rds_ib_stats_inc(s_ib_tasklet_call); - memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || @@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_send_xmit(ic->conn); } +static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr, i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + static void rds_ib_tasklet_fn_recv(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; @@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); if (state.ack_next_valid) rds_ib_set_ack(ic, state.ack_next, state.ack_required); @@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; - int ret; + int ret, fr_queue_space; /* * It's normal to see a null device if an incoming connection races @@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!rds_ibdev) return -EOPNOTSUPP; + /* The fr_queue_space is currently set to 512, to add extra space on + * completion queue and send queue. This extra space is used for FRMR + * registration and invalidation work requests + */ + fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - cq_attr.cqe = ic->i_send_ring.w_nr + 1; + cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, @@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.event_handler = rds_ib_qp_event_handler; attr.qp_context = conn; /* + 1 to allow for the single ack message */ - attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1; attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; attr.cap.max_send_sge = rds_ibdev->max_sge; attr.cap.max_recv_sge = RDS_IB_RECV_SGE; @@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.qp_type = IB_QPT_RC; attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; + atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) */ wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && - (atomic_read(&ic->i_signaled_sends) == 0)); + (atomic_read(&ic->i_signaled_sends) == 0) && + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c new file mode 100644 index 000000000000..4fe8f4fec4ee --- /dev/null +++ b/net/rds/ib_fmr.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + fmr = &ibmr->u.fmr; + fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(fmr->fmr)) { + err = PTR_ERR(fmr->fmr); + fmr->fmr = NULL; + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + return ibmr; + +out_no_cigar: + if (ibmr) { + if (fmr->fmr) + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_fmr *fmr = &ibmr->u.fmr; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > ibmr->pool->fmr_attr.max_pages) + return -EINVAL; + + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, + struct scatterlist *sg, + unsigned long nents, + u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int ret; + + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->device = rds_ibdev; + fmr = &ibmr->u.fmr; + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key = fmr->fmr->rkey; + else + rds_ib_free_mr(ibmr, 0); + + return ibmr; +} + +void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_fmr *fmr; + LIST_HEAD(fmr_list); + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + fmr = &ibmr->u.fmr; + list_add(&fmr->fmr->list, &fmr_list); + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + fmr = &ibmr->u.fmr; + *unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || + ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c new file mode 100644 index 000000000000..93ff038ea9d1 --- /dev/null +++ b/net/rds/ib_frmr.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, + int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + frmr = &ibmr->u.frmr; + frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, + pool->fmr_attr.max_pages); + if (IS_ERR(frmr->mr)) { + pr_warn("RDS/IB: %s failed to allocate MR", __func__); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + if (atomic_read(&pool->item_count) > pool->max_items_soft) + pool->max_items_soft = pool->max_items; + + frmr->fr_state = FRMR_IS_FREE; + return ibmr; + +out_no_cigar: + kfree(ibmr); + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (drop) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); +} + +static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct ib_send_wr *failed_wr; + struct ib_reg_wr reg_wr; + int ret; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE); + if (unlikely(ret != ibmr->sg_len)) + return ret < 0 ? ret : -EINVAL; + + /* Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. + */ + ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); + frmr->fr_state = FRMR_IS_INUSE; + + memset(®_wr, 0, sizeof(reg_wr)); + reg_wr.wr.wr_id = (unsigned long)(void *)ibmr; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.num_sge = 0; + reg_wr.mr = frmr->mr; + reg_wr.key = frmr->mr->rkey; + reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + reg_wr.wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = ®_wr.wr; + ret = ib_post_send(ibmr->ic->i_cm_id->qp, ®_wr.wr, &failed_wr); + WARN_ON(failed_wr != ®_wr.wr); + if (unlikely(ret)) { + /* Failure here can be because of -ENOMEM as well */ + frmr->fr_state = FRMR_IS_STALE; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + if (printk_ratelimit()) + pr_warn("RDS/IB: %s returned error(%d)\n", + __func__, ret); + } + return ret; +} + +static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr_pool *pool, + struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int sg_len) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + int i; + u32 len; + int ret = 0; + + /* We want to teardown old ibmr values here and fill it up with + * new sg values + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = sg; + ibmr->sg_len = sg_len; + ibmr->sg_dma_len = 0; + frmr->sg_byte_len = 0; + WARN_ON(ibmr->sg_dma_len); + ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + if (unlikely(!ibmr->sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + frmr->sg_byte_len = 0; + frmr->dma_npages = 0; + len = 0; + + ret = -EINVAL; + for (i = 0; i < ibmr->sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); + u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); + + frmr->sg_byte_len += dma_len; + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + goto out_unmap; + else + ++frmr->dma_npages; + } + + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < ibmr->sg_dma_len - 1) + goto out_unmap; + else + ++frmr->dma_npages; + } + + len += dma_len; + } + frmr->dma_npages += len >> PAGE_SHIFT; + + if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + ret = -EMSGSIZE; + goto out_unmap; + } + + ret = rds_ib_post_reg_frmr(ibmr); + if (ret) + goto out_unmap; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + + return ret; + +out_unmap: + ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + return ret; +} + +static int rds_ib_post_inv(struct rds_ib_mr *ibmr) +{ + struct ib_send_wr *s_wr, *failed_wr; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id; + int ret = -EINVAL; + + if (!i_cm_id || !i_cm_id->qp || !frmr->mr) + goto out; + + if (frmr->fr_state != FRMR_IS_INUSE) + goto out; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + frmr->fr_inv = true; + s_wr = &frmr->fr_wr; + + memset(s_wr, 0, sizeof(*s_wr)); + s_wr->wr_id = (unsigned long)(void *)ibmr; + s_wr->opcode = IB_WR_LOCAL_INV; + s_wr->ex.invalidate_rkey = frmr->mr->rkey; + s_wr->send_flags = IB_SEND_SIGNALED; + + failed_wr = s_wr; + ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr); + WARN_ON(failed_wr != s_wr); + if (unlikely(ret)) { + frmr->fr_state = FRMR_IS_STALE; + frmr->fr_inv = false; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); + goto out; + } +out: + return ret; +} + +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) +{ + struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_IS_STALE; + if (rds_conn_up(ic->conn)) + rds_ib_conn_error(ic->conn, + "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n", + &ic->conn->c_laddr, + &ic->conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status), + wc->vendor_err); + } + + if (frmr->fr_inv) { + frmr->fr_state = FRMR_IS_FREE; + frmr->fr_inv = false; + } + + atomic_inc(&ic->i_fastreg_wrs); +} + +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_frmr *frmr; + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + if (ibmr->sg_dma_len) + ret |= rds_ib_post_inv(ibmr); + } + if (ret) + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + *unpinned += ibmr->sg_len; + frmr = &ibmr->u.frmr; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || frmr->fr_state == FRMR_IS_STALE) { + /* Don't de-allocate if the MR is not free yet */ + if (frmr->fr_state == FRMR_IS_INUSE) + continue; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + if (frmr->mr) + ib_dereg_mr(frmr->mr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int ret; + + do { + if (ibmr) + rds_ib_free_frmr(ibmr, true); + ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + frmr = &ibmr->u.frmr; + } while (frmr->fr_state != FRMR_IS_FREE); + + ibmr->ic = ic; + ibmr->device = rds_ibdev; + ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents); + if (ret == 0) { + *key = frmr->mr->rkey; + } else { + rds_ib_free_frmr(ibmr, false); + ibmr = ERR_PTR(ret); + } + + return ibmr; +} + +void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (frmr->fr_state == FRMR_IS_STALE) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h new file mode 100644 index 000000000000..1c754f4acbe5 --- /dev/null +++ b/net/rds/ib_mr.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _RDS_IB_MR_H +#define _RDS_IB_MR_H + +#include <linux/kernel.h> + +#include "rds.h" +#include "ib.h" + +#define RDS_MR_1M_POOL_SIZE (8192 / 2) +#define RDS_MR_1M_MSG_SIZE 256 +#define RDS_MR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) +#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) + +struct rds_ib_fmr { + struct ib_fmr *fmr; + u64 *dma; +}; + +enum rds_ib_fr_state { + FRMR_IS_FREE, /* mr invalidated & ready for use */ + FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ + FRMR_IS_STALE, /* Stale MR and needs to be dropped */ +}; + +struct rds_ib_frmr { + struct ib_mr *mr; + enum rds_ib_fr_state fr_state; + bool fr_inv; + struct ib_send_wr fr_wr; + unsigned int dma_npages; + unsigned int sg_byte_len; +}; + +/* This is stored as mr->r_trans_private. */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct rds_ib_connection *ic; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + int sg_dma_len; + + union { + struct rds_ib_fmr fmr; + struct rds_ib_frmr frmr; + } u; +}; + +/* Our own little MR pool */ +struct rds_ib_mr_pool { + unsigned int pool_type; + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct llist_head drop_list; /* MRs not reached max_maps */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* unused & unmapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; + bool use_fastreg; +}; + +extern struct workqueue_struct *rds_ib_mr_wq; +extern unsigned int rds_ib_mr_1m_pool_size; +extern unsigned int rds_ib_mr_8k_pool_size; +extern bool prefer_frmr; + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); +int rds_ib_mr_init(void); +void rds_ib_mr_exit(void); + +void __rds_ib_teardown_mr(struct rds_ib_mr *); +void rds_ib_teardown_mr(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); +int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *, + struct scatterlist *, unsigned int); +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, + unsigned long, u32 *); +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); +void rds_ib_unreg_fmr(struct list_head *, unsigned int *, + unsigned long *, unsigned int); +void rds_ib_free_fmr_list(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key); +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal); +void rds_ib_free_frmr_list(struct rds_ib_mr *); +#endif diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a2340748ec86..f7164ac1ffc1 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -35,78 +35,13 @@ #include <linux/rculist.h> #include <linux/llist.h> -#include "rds.h" -#include "ib.h" +#include "ib_mr.h" + +struct workqueue_struct *rds_ib_mr_wq; static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 -/* - * This is stored as mr->r_trans_private. - */ -struct rds_ib_mr { - struct rds_ib_device *device; - struct rds_ib_mr_pool *pool; - struct ib_fmr *fmr; - - struct llist_node llnode; - - /* unmap_list is for freeing */ - struct list_head unmap_list; - unsigned int remap_count; - - struct scatterlist *sg; - unsigned int sg_len; - u64 *dma; - int sg_dma_len; -}; - -/* - * Our own little FMR pool - */ -struct rds_ib_mr_pool { - unsigned int pool_type; - struct mutex flush_lock; /* serialize fmr invalidate */ - struct delayed_work flush_worker; /* flush worker */ - - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - - struct llist_head drop_list; /* MRs that have reached their max_maps limit */ - struct llist_head free_list; /* unused MRs */ - struct llist_head clean_list; /* global unused & unamapped MRs */ - wait_queue_head_t flush_wait; - - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; - struct ib_fmr_attr fmr_attr; -}; - -static struct workqueue_struct *rds_ib_fmr_wq; - -int rds_ib_fmr_init(void) -{ - rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); - if (!rds_ib_fmr_wq) - return -ENOMEM; - return 0; -} - -/* By the time this is called all the IB devices should have been torn down and - * had their pools freed. As each pool is freed its work struct is waited on, - * so the pool flushing work queue should be idle by the time we get here. - */ -void rds_ib_fmr_exit(void) -{ - destroy_workqueue(rds_ib_fmr_wq); -} - -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); -static void rds_ib_mr_pool_flush_worker(struct work_struct *work); - static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; @@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, - int pool_type) -{ - struct rds_ib_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->pool_type = pool_type; - init_llist_head(&pool->free_list); - init_llist_head(&pool->drop_list); - init_llist_head(&pool->clean_list); - mutex_init(&pool->flush_lock); - init_waitqueue_head(&pool->flush_wait); - INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - - if (pool_type == RDS_IB_MR_1M_POOL) { - /* +1 allows for unaligned MRs */ - pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; - pool->max_items = RDS_FMR_1M_POOL_SIZE; - } else { - /* pool_type == RDS_IB_MR_8K_POOL */ - pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; - pool->max_items = RDS_FMR_8K_POOL_SIZE; - } - - pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; - pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - - return pool; -} - void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; @@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) -{ - cancel_delayed_work_sync(&pool->flush_worker); - rds_ib_flush_mr_pool(pool, 1, NULL); - WARN_ON(atomic_read(&pool->item_count)); - WARN_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; @@ -297,8 +188,13 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) flag = this_cpu_ptr(&clean_list_grace); set_bit(CLEAN_LIST_BUSY_BIT, flag); ret = llist_del_first(&pool->clean_list); - if (ret) + if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); + } clear_bit(CLEAN_LIST_BUSY_BIT, flag); preempt_enable(); @@ -317,190 +213,6 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, - int npages) -{ - struct rds_ib_mr_pool *pool; - struct rds_ib_mr *ibmr = NULL; - int err = 0, iter = 0; - - if (npages <= RDS_FMR_8K_MSG_SIZE) - pool = rds_ibdev->mr_8k_pool; - else - pool = rds_ibdev->mr_1m_pool; - - if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); - - /* Switch pools if one of the pool is reaching upper limit */ - if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - pool = rds_ibdev->mr_1m_pool; - else - pool = rds_ibdev->mr_8k_pool; - } - - while (1) { - ibmr = rds_ib_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. */ - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); - rds_ib_flush_mr_pool(pool, 0, &ibmr); - if (ibmr) - return ibmr; - } - - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE| - IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); - if (IS_ERR(ibmr->fmr)) { - err = PTR_ERR(ibmr->fmr); - ibmr->fmr = NULL; - printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); - goto out_no_cigar; - } - - ibmr->pool = pool; - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); - - return ibmr; - -out_no_cigar: - if (ibmr) { - if (ibmr->fmr) - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, - struct scatterlist *sg, unsigned int nents) -{ - struct ib_device *dev = rds_ibdev->dev; - struct scatterlist *scat = sg; - u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt, sg_dma_len; - int i, j; - int ret; - - sg_dma_len = ib_dma_map_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - if (unlikely(!sg_dma_len)) { - printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); - return -EBUSY; - } - - len = 0; - page_cnt = 0; - - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - if (dma_addr & ~PAGE_MASK) { - if (i > 0) - return -EINVAL; - else - ++page_cnt; - } - if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) - return -EINVAL; - else - ++page_cnt; - } - - len += dma_len; - } - - page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) - return -EINVAL; - - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) - return -ENOMEM; - - page_cnt = 0; - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - for (j = 0; j < dma_len; j += PAGE_SIZE) - dma_pages[page_cnt++] = - (dma_addr & PAGE_MASK) + j; - } - - ret = ib_map_phys_fmr(ibmr->fmr, - dma_pages, page_cnt, io_addr); - if (ret) - goto out; - - /* Success - we successfully remapped the MR, so we can - * safely tear down the old mapping. */ - rds_ib_teardown_mr(ibmr); - - ibmr->sg = scat; - ibmr->sg_len = nents; - ibmr->sg_dma_len = sg_dma_len; - ibmr->remap_count++; - - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_used); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_used); - ret = 0; - -out: - kfree(dma_pages); - - return ret; -} - void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; @@ -518,7 +230,7 @@ void rds_ib_sync_mr(void *trans_private, int direction) } } -static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; @@ -549,7 +261,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) } } -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; @@ -623,17 +335,15 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { - struct rds_ib_mr *ibmr, *next; + struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; - int ret = 0; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); @@ -643,7 +353,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -655,7 +365,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (llist_empty(&pool->clean_list)) schedule(); - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -667,7 +377,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, mutex_lock(&pool->flush_lock); if (ibmr_ret) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; @@ -687,30 +397,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (list_empty(&unmap_list)) goto out; - /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, unmap_list) - list_add(&ibmr->fmr->list, &fmr_list); - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); - - /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { - unpinned += ibmr->sg_len; - __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || - ibmr->remap_count >= pool->fmr_attr.max_maps) { - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_free); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_free); - list_del(&ibmr->unmap_list); - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - nfreed++; - } - } + if (pool->use_fastreg) + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); + else + rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { /* we have to make sure that none of the things we're about @@ -743,7 +433,47 @@ out: if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: - return ret; + return 0; +} + +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + int iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + while (1) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) + return ibmr; + + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); + + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + return ibmr; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) @@ -762,10 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - if (ibmr->remap_count >= pool->fmr_attr.max_maps) - llist_add(&ibmr->llnode, &pool->drop_list); + if (rds_ibdev->use_fastreg) + rds_ib_free_frmr_list(ibmr); else - llist_add(&ibmr->llnode, &pool->free_list); + rds_ib_free_fmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); @@ -773,7 +503,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -782,7 +512,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_delayed_work(rds_ib_fmr_wq, + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } @@ -810,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; + struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); @@ -823,29 +554,81 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); - if (IS_ERR(ibmr)) { - rds_ib_dev_put(rds_ibdev); - return ibmr; - } - - ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); - if (ret == 0) - *key_ret = ibmr->fmr->rkey; + if (rds_ibdev->use_fastreg) + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); else - printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); - - ibmr->device = rds_ibdev; - rds_ibdev = NULL; + ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + if (ibmr) + rds_ibdev = NULL; out: - if (ret) { - if (ibmr) - rds_ib_free_mr(ibmr, 0); - ibmr = ERR_PTR(ret); - } + if (!ibmr) + pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); + if (rds_ibdev) rds_ib_dev_put(rds_ibdev); + return ibmr; } +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->pool_type = pool_type; + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_items = RDS_MR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_items = RDS_MR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; + pool->use_fastreg = rds_ibdev->use_fastreg; + + return pool; +} + +int rds_ib_mr_init(void) +{ + rds_ib_mr_wq = create_workqueue("rds_mr_flushd"); + if (!rds_ib_mr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_mr_exit(void) +{ + destroy_workqueue(rds_ib_mr_wq); +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index eac30bf486d7..f27d2c82b036 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i | RDS_IB_SEND_OP; + send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; @@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) oldest = rds_ib_ring_oldest(&ic->i_send_ring); - completed = rds_ib_ring_completed(&ic->i_send_ring, - (wc->wr_id & ~RDS_IB_SEND_OP), - oldest); + completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest); for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d77e04473056..7e78dca1f252 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", + "ib_rdma_mr_8k_reused", + "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; diff --git a/net/rds/iw.c b/net/rds/iw.c deleted file mode 100644 index f4a9fff829e0..000000000000 --- a/net/rds/iw.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_arp.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/module.h> - -#include "rds.h" -#include "iw.h" - -unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; -unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ - -module_param(fastreg_pool_size, int, 0444); -MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); -module_param(fastreg_message_size, int, 0444); -MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); - -struct list_head rds_iw_devices; - -/* NOTE: if also grabbing iwdev lock, grab this first */ -DEFINE_SPINLOCK(iw_nodev_conns_lock); -LIST_HEAD(iw_nodev_conns); - -static void rds_iw_add_one(struct ib_device *device) -{ - struct rds_iw_device *rds_iwdev; - - /* Only handle iwarp devices */ - if (device->node_type != RDMA_NODE_RNIC) - return; - - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); - if (!rds_iwdev) - return; - - spin_lock_init(&rds_iwdev->spinlock); - - rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = device->attrs.max_qp_wr; - rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); - - rds_iwdev->dev = device; - rds_iwdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_iwdev->pd)) - goto free_dev; - - if (!rds_iwdev->dma_local_lkey) { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_iwdev->mr)) - goto err_pd; - } else - rds_iwdev->mr = NULL; - - rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); - if (IS_ERR(rds_iwdev->mr_pool)) { - rds_iwdev->mr_pool = NULL; - goto err_mr; - } - - INIT_LIST_HEAD(&rds_iwdev->cm_id_list); - INIT_LIST_HEAD(&rds_iwdev->conn_list); - list_add_tail(&rds_iwdev->list, &rds_iw_devices); - - ib_set_client_data(device, &rds_iw_client, rds_iwdev); - return; - -err_mr: - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); -err_pd: - ib_dealloc_pd(rds_iwdev->pd); -free_dev: - kfree(rds_iwdev); -} - -static void rds_iw_remove_one(struct ib_device *device, void *client_data) -{ - struct rds_iw_device *rds_iwdev = client_data; - struct rds_iw_cm_id *i_cm_id, *next; - - if (!rds_iwdev) - return; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - } - spin_unlock_irq(&rds_iwdev->spinlock); - - rds_iw_destroy_conns(rds_iwdev); - - if (rds_iwdev->mr_pool) - rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); - - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); - - ib_dealloc_pd(rds_iwdev->pd); - - list_del(&rds_iwdev->list); - kfree(rds_iwdev); -} - -struct ib_client rds_iw_client = { - .name = "rds_iw", - .add = rds_iw_add_one, - .remove = rds_iw_remove_one -}; - -static int rds_iw_conn_info_visitor(struct rds_connection *conn, - void *buffer) -{ - struct rds_info_rdma_connection *iinfo = buffer; - struct rds_iw_connection *ic; - - /* We will only ever look at IB transports */ - if (conn->c_trans != &rds_iw_transport) - return 0; - - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; - - memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); - memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); - if (rds_conn_state(conn) == RDS_CONN_UP) { - struct rds_iw_device *rds_iwdev; - struct rdma_dev_addr *dev_addr; - - ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - iinfo->max_send_wr = ic->i_send_ring.w_nr; - iinfo->max_recv_wr = ic->i_recv_ring.w_nr; - iinfo->max_send_sge = rds_iwdev->max_sge; - rds_iw_get_mr_info(rds_iwdev, iinfo); - } - return 1; -} - -static void rds_iw_ic_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens) -{ - rds_for_each_conn_info(sock, len, iter, lens, - rds_iw_conn_info_visitor, - sizeof(struct rds_info_rdma_connection)); -} - - -/* - * Early RDS/IB was built to only bind to an address if there is an IPoIB - * device with that address set. - * - * If it were me, I'd advocate for something more flexible. Sending and - * receiving should be device-agnostic. Transports would try and maintain - * connections between peers who have messages queued. Userspace would be - * allowed to influence which paths have priority. We could call userspace - * asserting this policy "routing". - */ -static int rds_iw_laddr_check(struct net *net, __be32 addr) -{ - int ret; - struct rdma_cm_id *cm_id; - struct sockaddr_in sin; - - /* Create a CMA ID and try to bind it. This catches both - * IB and iWARP capable NICs. - */ - cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); - - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - - /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); - /* due to this, we will claim to support IB devices unless we - check node_type. */ - if (ret || !cm_id->device || - cm_id->device->node_type != RDMA_NODE_RNIC) - ret = -EADDRNOTAVAIL; - - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? cm_id->device->node_type : -1); - - rdma_destroy_id(cm_id); - - return ret; -} - -void rds_iw_exit(void) -{ - rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - rds_iw_destroy_nodev_conns(); - ib_unregister_client(&rds_iw_client); - rds_iw_sysctl_exit(); - rds_iw_recv_exit(); - rds_trans_unregister(&rds_iw_transport); -} - -struct rds_transport rds_iw_transport = { - .laddr_check = rds_iw_laddr_check, - .xmit_complete = rds_iw_xmit_complete, - .xmit = rds_iw_xmit, - .xmit_rdma = rds_iw_xmit_rdma, - .recv = rds_iw_recv, - .conn_alloc = rds_iw_conn_alloc, - .conn_free = rds_iw_conn_free, - .conn_connect = rds_iw_conn_connect, - .conn_shutdown = rds_iw_conn_shutdown, - .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_free = rds_iw_inc_free, - .cm_initiate_connect = rds_iw_cm_initiate_connect, - .cm_handle_connect = rds_iw_cm_handle_connect, - .cm_connect_complete = rds_iw_cm_connect_complete, - .stats_info_copy = rds_iw_stats_info_copy, - .exit = rds_iw_exit, - .get_mr = rds_iw_get_mr, - .sync_mr = rds_iw_sync_mr, - .free_mr = rds_iw_free_mr, - .flush_mrs = rds_iw_flush_mrs, - .t_owner = THIS_MODULE, - .t_name = "iwarp", - .t_type = RDS_TRANS_IWARP, - .t_prefer_loopback = 1, -}; - -int rds_iw_init(void) -{ - int ret; - - INIT_LIST_HEAD(&rds_iw_devices); - - ret = ib_register_client(&rds_iw_client); - if (ret) - goto out; - - ret = rds_iw_sysctl_init(); - if (ret) - goto out_ibreg; - - ret = rds_iw_recv_init(); - if (ret) - goto out_sysctl; - - ret = rds_trans_register(&rds_iw_transport); - if (ret) - goto out_recv; - - rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - - goto out; - -out_recv: - rds_iw_recv_exit(); -out_sysctl: - rds_iw_sysctl_exit(); -out_ibreg: - ib_unregister_client(&rds_iw_client); -out: - return ret; -} - -MODULE_LICENSE("GPL"); - diff --git a/net/rds/iw.h b/net/rds/iw.h deleted file mode 100644 index 5af01d1758b3..000000000000 --- a/net/rds/iw.h +++ /dev/null @@ -1,398 +0,0 @@ -#ifndef _RDS_IW_H -#define _RDS_IW_H - -#include <linux/interrupt.h> -#include <rdma/ib_verbs.h> -#include <rdma/rdma_cm.h> -#include "rds.h" -#include "rdma_transport.h" - -#define RDS_FASTREG_SIZE 20 -#define RDS_FASTREG_POOL_SIZE 2048 - -#define RDS_IW_MAX_SGE 8 -#define RDS_IW_RECV_SGE 2 - -#define RDS_IW_DEFAULT_RECV_WR 1024 -#define RDS_IW_DEFAULT_SEND_WR 256 - -#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ - -extern struct list_head rds_iw_devices; - -/* - * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to - * try and minimize the amount of memory tied up both the device and - * socket receive queues. - */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) -struct rds_page_frag { - struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; -}; - -struct rds_iw_incoming { - struct list_head ii_frags; - struct rds_incoming ii_inc; -}; - -struct rds_iw_connect_private { - /* Add new fields at the end, and don't permute existing fields. */ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - __be32 dp_reserved1; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ -}; - -struct rds_iw_scatterlist { - struct scatterlist *list; - unsigned int len; - int dma_len; - unsigned int dma_npages; - unsigned int bytes; -}; - -struct rds_iw_mapping { - spinlock_t m_lock; /* protect the mapping struct */ - struct list_head m_list; - struct rds_iw_mr *m_mr; - uint32_t m_rkey; - struct rds_iw_scatterlist m_sg; -}; - -struct rds_iw_send_work { - struct rds_message *s_rm; - - /* We should really put these into a union: */ - struct rm_rdma_op *s_op; - struct rds_iw_mapping *s_mapping; - struct ib_mr *s_mr; - unsigned char s_remap_count; - - union { - struct ib_send_wr s_send_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_reg_wr s_reg_wr; - }; - struct ib_sge s_sge[RDS_IW_MAX_SGE]; - unsigned long s_queued; -}; - -struct rds_iw_recv_work { - struct rds_iw_incoming *r_iwinc; - struct rds_page_frag *r_frag; - struct ib_recv_wr r_wr; - struct ib_sge r_sge[2]; -}; - -struct rds_iw_work_ring { - u32 w_nr; - u32 w_alloc_ptr; - u32 w_alloc_ctr; - u32 w_free_ptr; - atomic_t w_free_ctr; -}; - -struct rds_iw_device; - -struct rds_iw_connection { - - struct list_head iw_node; - struct rds_iw_device *rds_iwdev; - struct rds_connection *conn; - - /* alphabet soup, IBTA style */ - struct rdma_cm_id *i_cm_id; - struct ib_pd *i_pd; - struct ib_mr *i_mr; - struct ib_cq *i_send_cq; - struct ib_cq *i_recv_cq; - - /* tx */ - struct rds_iw_work_ring i_send_ring; - struct rds_message *i_rm; - struct rds_header *i_send_hdrs; - u64 i_send_hdrs_dma; - struct rds_iw_send_work *i_sends; - - /* rx */ - struct tasklet_struct i_recv_tasklet; - struct mutex i_recv_mutex; - struct rds_iw_work_ring i_recv_ring; - struct rds_iw_incoming *i_iwinc; - u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - u64 i_recv_hdrs_dma; - struct rds_iw_recv_work *i_recvs; - struct rds_page_frag i_frag; - u64 i_ack_recv; /* last ACK received */ - - /* sending acks */ - unsigned long i_ack_flags; -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_t i_ack_next; /* next ACK to send */ -#else - spinlock_t i_ack_lock; /* protect i_ack_next */ - u64 i_ack_next; /* next ACK to send */ -#endif - struct rds_header *i_ack; - struct ib_send_wr i_ack_wr; - struct ib_sge i_ack_sge; - u64 i_ack_dma; - unsigned long i_ack_queued; - - /* Flow control related information - * - * Our algorithm uses a pair variables that we need to access - * atomically - one for the send credits, and one posted - * recv credits we need to transfer to remote. - * Rather than protect them using a slow spinlock, we put both into - * a single atomic_t and update it using cmpxchg - */ - atomic_t i_credits; - - /* Protocol version specific information */ - unsigned int i_flowctl:1; /* enable/disable flow ctl */ - unsigned int i_dma_local_lkey:1; - unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ - /* Batched completions */ - unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; -}; - -/* This assumes that atomic_t is at least 32 bits */ -#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_GET_POST_CREDITS(v) ((v) >> 16) -#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_SET_POST_CREDITS(v) ((v) << 16) - -struct rds_iw_cm_id { - struct list_head list; - struct rdma_cm_id *cm_id; -}; - -struct rds_iw_device { - struct list_head list; - struct list_head cm_id_list; - struct list_head conn_list; - struct ib_device *dev; - struct ib_pd *pd; - struct ib_mr *mr; - struct rds_iw_mr_pool *mr_pool; - int max_sge; - unsigned int max_wrs; - unsigned int dma_local_lkey:1; - spinlock_t spinlock; /* protect the above */ -}; - -/* bits for i_ack_flags */ -#define IB_ACK_IN_FLIGHT 0 -#define IB_ACK_REQUESTED 1 - -/* Magic WR_ID for ACKs */ -#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) -#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL) -#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) - -struct rds_iw_statistics { - uint64_t s_iw_connect_raced; - uint64_t s_iw_listen_closed_stale; - uint64_t s_iw_tx_cq_call; - uint64_t s_iw_tx_cq_event; - uint64_t s_iw_tx_ring_full; - uint64_t s_iw_tx_throttle; - uint64_t s_iw_tx_sg_mapping_failure; - uint64_t s_iw_tx_stalled; - uint64_t s_iw_tx_credit_updates; - uint64_t s_iw_rx_cq_call; - uint64_t s_iw_rx_cq_event; - uint64_t s_iw_rx_ring_empty; - uint64_t s_iw_rx_refill_from_cq; - uint64_t s_iw_rx_refill_from_thread; - uint64_t s_iw_rx_alloc_limit; - uint64_t s_iw_rx_credit_updates; - uint64_t s_iw_ack_sent; - uint64_t s_iw_ack_send_failure; - uint64_t s_iw_ack_send_delayed; - uint64_t s_iw_ack_send_piggybacked; - uint64_t s_iw_ack_received; - uint64_t s_iw_rdma_mr_alloc; - uint64_t s_iw_rdma_mr_free; - uint64_t s_iw_rdma_mr_used; - uint64_t s_iw_rdma_mr_pool_flush; - uint64_t s_iw_rdma_mr_pool_wait; - uint64_t s_iw_rdma_mr_pool_depleted; -}; - -extern struct workqueue_struct *rds_iw_wq; - -/* - * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h - * doesn't define it. - */ -static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu - -static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device - -static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) -{ - return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; -} - -/* ib.c */ -extern struct rds_transport rds_iw_transport; -extern struct ib_client rds_iw_client; - -extern unsigned int fastreg_pool_size; -extern unsigned int fastreg_message_size; - -extern spinlock_t iw_nodev_conns_lock; -extern struct list_head iw_nodev_conns; - -/* ib_cm.c */ -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); -void rds_iw_conn_free(void *arg); -int rds_iw_conn_connect(struct rds_connection *conn); -void rds_iw_conn_shutdown(struct rds_connection *conn); -void rds_iw_state_change(struct sock *sk); -int rds_iw_listen_init(void); -void rds_iw_listen_stop(void); -void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); -int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); -void rds_iw_cm_connect_complete(struct rds_connection *conn, - struct rdma_cm_event *event); - - -#define rds_iw_conn_error(conn, fmt...) \ - __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) - -/* ib_rdma.c */ -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_iw_destroy_nodev_conns(void) -{ - __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); -} -static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) -{ - __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); -} -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); -void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_iw_sync_mr(void *trans_private, int dir); -void rds_iw_free_mr(void *trans_private, int invalidate); -void rds_iw_flush_mrs(void); - -/* ib_recv.c */ -int rds_iw_recv_init(void); -void rds_iw_recv_exit(void); -int rds_iw_recv(struct rds_connection *conn); -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); -void rds_iw_inc_free(struct rds_incoming *inc); -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_recv_tasklet_fn(unsigned long data); -void rds_iw_recv_init_ring(struct rds_iw_connection *ic); -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); -void rds_iw_recv_init_ack(struct rds_iw_connection *ic); -void rds_iw_attempt_ack(struct rds_iw_connection *ic); -void rds_iw_ack_send_complete(struct rds_iw_connection *ic); -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); - -/* ib_ring.c */ -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); -int rds_iw_ring_empty(struct rds_iw_work_ring *ring); -int rds_iw_ring_low(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); -extern wait_queue_head_t rds_iw_ring_empty_wait; - -/* ib_send.c */ -void rds_iw_xmit_complete(struct rds_connection *conn); -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_send_init_ring(struct rds_iw_connection *ic); -void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted, int max_posted); - -/* ib_stats.c */ -DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); -#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail); - -/* ib_sysctl.c */ -int rds_iw_sysctl_init(void); -void rds_iw_sysctl_exit(void); -extern unsigned long rds_iw_sysctl_max_send_wr; -extern unsigned long rds_iw_sysctl_max_recv_wr; -extern unsigned long rds_iw_sysctl_max_unsig_wrs; -extern unsigned long rds_iw_sysctl_max_unsig_bytes; -extern unsigned long rds_iw_sysctl_max_recv_allocation; -extern unsigned int rds_iw_sysctl_flow_control; - -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - */ -static inline struct ib_sge * -rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[0]; -} - -static inline struct ib_sge * -rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[1]; -} - -#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c deleted file mode 100644 index aea4c911bc76..000000000000 --- a/net/rds/iw_cm.c +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -/* - * Set the selected protocol version - */ -static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) -{ - conn->c_version = version; -} - -/* - * Set up flow control - */ -static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (rds_iw_sysctl_flow_control && credits != 0) { - /* We're doing flow control */ - ic->i_flowctl = 1; - rds_iw_send_add_credits(conn, credits); - } else { - ic->i_flowctl = 0; - } -} - -/* - * Connection established. - * We get here for both outgoing and incoming connection. - */ -void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) -{ - const struct rds_iw_connect_private *dp = NULL; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - int err; - - if (event->param.conn.private_data_len) { - dp = event->param.conn.private_data; - - rds_iw_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - } - - /* update ib_device with this local ipaddr & conn */ - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); - if (err) - printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); - rds_iw_add_conn(rds_iwdev, conn); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", - &conn->c_laddr, &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); - - rds_connect_complete(conn); -} - -static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_iw_connect_private *dp, - u32 protocol_version) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? */ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; - - if (dp) { - memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = rds_iw_piggyb_ack(ic); - - /* Advertise flow control */ - if (ic->i_flowctl) { - unsigned int credits; - - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); - } - - conn_param->private_data = dp; - conn_param->private_data_len = sizeof(*dp); - } -} - -static void rds_iw_cq_event_handler(struct ib_event *event, void *data) -{ - rdsdebug("event %u data %p\n", event->event, data); -} - -static void rds_iw_qp_event_handler(struct ib_event *event, void *data) -{ - struct rds_connection *conn = data; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); - - switch (event->event) { - case IB_EVENT_COMM_EST: - rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); - break; - case IB_EVENT_QP_REQ_ERR: - case IB_EVENT_QP_FATAL: - default: - rdsdebug("Fatal QP Event %u " - "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, - &conn->c_faddr); - rds_conn_drop(conn); - break; - } -} - -/* - * Create a QP - */ -static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, - struct rds_iw_device *rds_iwdev, - struct rds_iw_work_ring *send_ring, - void (*send_cq_handler)(struct ib_cq *, void *), - struct rds_iw_work_ring *recv_ring, - void (*recv_cq_handler)(struct ib_cq *, void *), - void *context) -{ - struct ib_device *dev = rds_iwdev->dev; - struct ib_cq_init_attr cq_attr = {}; - unsigned int send_size, recv_size; - int ret; - - /* The offset of 1 is to accommodate the additional ACK WR. */ - send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); - recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); - rds_iw_ring_resize(send_ring, send_size - 1); - rds_iw_ring_resize(recv_ring, recv_size - 1); - - memset(attr, 0, sizeof(*attr)); - attr->event_handler = rds_iw_qp_event_handler; - attr->qp_context = context; - attr->cap.max_send_wr = send_size; - attr->cap.max_recv_wr = recv_size; - attr->cap.max_send_sge = rds_iwdev->max_sge; - attr->cap.max_recv_sge = RDS_IW_RECV_SGE; - attr->sq_sig_type = IB_SIGNAL_REQ_WR; - attr->qp_type = IB_QPT_RC; - - cq_attr.cqe = send_size; - attr->send_cq = ib_create_cq(dev, send_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->send_cq)) { - ret = PTR_ERR(attr->send_cq); - attr->send_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - cq_attr.cqe = recv_size; - attr->recv_cq = ib_create_cq(dev, recv_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->recv_cq)) { - ret = PTR_ERR(attr->recv_cq); - attr->recv_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); - if (ret) { - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); - if (ret) { - rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; - } - -out: - if (ret) { - if (attr->send_cq) - ib_destroy_cq(attr->send_cq); - if (attr->recv_cq) - ib_destroy_cq(attr->recv_cq); - } - return ret; -} - -/* - * This needs to be very careful to not leave IS_ERR pointers around for - * cleanup to trip over. - */ -static int rds_iw_setup_qp(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct ib_qp_init_attr attr; - struct rds_iw_device *rds_iwdev; - int ret; - - /* rds_iw_add_one creates a rds_iw_device object per IB device, - * and allocates a protection domain, memory range and MR pool - * for each. If that fails for any reason, it will not register - * the rds_iwdev at all. - */ - rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (!rds_iwdev) { - printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", - dev->name); - return -EOPNOTSUPP; - } - - /* Protection domain and memory range */ - ic->i_pd = rds_iwdev->pd; - ic->i_mr = rds_iwdev->mr; - - ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, - &ic->i_send_ring, rds_iw_send_cq_comp_handler, - &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, - conn); - if (ret < 0) - goto out; - - ic->i_send_cq = attr.send_cq; - ic->i_recv_cq = attr.recv_cq; - - /* - * XXX this can fail if max_*_wr is too large? Are we supposed - * to back off until we get a value that the hardware can support? - */ - ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); - if (ret) { - rdsdebug("rdma_create_qp failed: %d\n", ret); - goto out; - } - - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); - if (!ic->i_send_hdrs) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); - goto out; - } - - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (!ic->i_recv_hdrs) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; - } - - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); - if (!ic->i_ack) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); - goto out; - } - - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (!ic->i_sends) { - ret = -ENOMEM; - rdsdebug("send allocation failed\n"); - goto out; - } - rds_iw_send_init_ring(ic); - - ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (!ic->i_recvs) { - ret = -ENOMEM; - rdsdebug("recv allocation failed\n"); - goto out; - } - - rds_iw_recv_init_ring(ic); - rds_iw_recv_init_ack(ic); - - /* Post receive buffers - as a side effect, this will update - * the posted credit count. */ - rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); - - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, - ic->i_send_cq, ic->i_recv_cq); - -out: - return ret; -} - -static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) -{ - u16 common; - u32 version = 0; - - /* rdma_cm private data is odd - when there is any private data in the - * request, we will be given a pretty large buffer without telling us the - * original size. The only way to tell the difference is by looking at - * the contents, which are initialized to zero. - * If the protocol version fields aren't set, this is a connection attempt - * from an older version. This could could be 3.0 or 2.0 - we can't tell. - * We really should have changed this for OFED 1.3 :-( */ - if (dp->dp_protocol_major == 0) - return RDS_PROTOCOL_3_0; - - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; - if (dp->dp_protocol_major == 3 && common) { - version = RDS_PROTOCOL_3_0; - while ((common >>= 1) != 0) - version++; - } - printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using " - "incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); - return version; -} - -int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) -{ - const struct rds_iw_connect_private *dp = event->param.conn.private_data; - struct rds_iw_connect_private dp_rep; - struct rds_connection *conn = NULL; - struct rds_iw_connection *ic = NULL; - struct rdma_conn_param conn_param; - struct rds_iw_device *rds_iwdev; - u32 version; - int err, destroy = 1; - - /* Check whether the remote protocol version matches ours. */ - version = rds_iw_protocol_compatible(dp); - if (!version) - goto out; - - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", - &dp->dp_saddr, &dp->dp_daddr, - RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - - /* RDS/IW is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_iw_transport, GFP_KERNEL); - if (IS_ERR(conn)) { - rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); - conn = NULL; - goto out; - } - - /* - * The connection request may occur while the - * previous connection exist, e.g. in case of failover. - * But as connections may be initiated simultaneously - * by both hosts, we have a random backoff mechanism - - * see the comment above rds_queue_reconnect() - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) { - rdsdebug("incoming connect while connecting\n"); - rds_conn_drop(conn); - rds_iw_stats_inc(s_iw_listen_closed_stale); - } else - if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { - /* Wait and see - our connect may still be succeeding */ - rds_iw_stats_inc(s_iw_connect_raced); - } - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - ic = conn->c_transport_data; - - rds_iw_set_protocol(conn, version); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - BUG_ON(cm_id->context); - BUG_ON(ic->i_cm_id); - - ic->i_cm_id = cm_id; - cm_id->context = conn; - - rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - /* We got halfway through setting up the ib_connection, if we - * fail now, we have to take the long route out of this mess. */ - destroy = 0; - - err = rds_iw_setup_qp(conn); - if (err) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); - - /* rdma_accept() calls rdma_reject() internally if it fails */ - err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { - rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; - -out: - rdma_reject(cm_id, NULL, 0); - return destroy; -} - - -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) -{ - struct rds_connection *conn = cm_id->context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rdma_conn_param conn_param; - struct rds_iw_connect_private dp; - int ret; - - /* If the peer doesn't do protocol negotiation, we must - * default to RDSv3.0 */ - rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); - ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ - - ret = rds_iw_setup_qp(conn); - if (ret) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - - ret = rdma_connect(cm_id, &conn_param); - if (ret) - rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); - -out: - /* Beware - returning non-zero tells the rdma_cm to destroy - * the cm_id. We should certainly not do it as long as we still - * "own" the cm_id. */ - if (ret) { - struct rds_iw_connection *ic = conn->c_transport_data; - - if (ic->i_cm_id == cm_id) - ret = 0; - } - return ret; -} - -int rds_iw_conn_connect(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - struct sockaddr_in src, dest; - int ret; - - /* XXX I wonder what affect the port space has */ - /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(ic->i_cm_id)) { - ret = PTR_ERR(ic->i_cm_id); - ic->i_cm_id = NULL; - rdsdebug("rdma_create_id() failed: %d\n", ret); - goto out; - } - - rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); - - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); - - /* First, bind to the local address and device. */ - ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); - if (ret) { - rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", - &conn->c_laddr, ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - goto out; - } - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); - - ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, - (struct sockaddr *)&dest, - RDS_RDMA_RESOLVE_TIMEOUT_MS); - if (ret) { - rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, - ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - } - -out: - return ret; -} - -/* - * This is so careful about only cleaning up resources that were built up - * so that it can be called at any point during startup. In fact it - * can be called multiple times for a given connection. - */ -void rds_iw_conn_shutdown(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int err = 0; - struct ib_qp_attr qp_attr; - - rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, - ic->i_pd, ic->i_send_cq, ic->i_recv_cq, - ic->i_cm_id ? ic->i_cm_id->qp : NULL); - - if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); - err = rdma_disconnect(ic->i_cm_id); - if (err) { - /* Actually this may happen quite frequently, when - * an outgoing connect raced with an incoming connect. - */ - rdsdebug("failed to disconnect, cm: %p err %d\n", - ic->i_cm_id, err); - } - - if (ic->i_cm_id->qp) { - qp_attr.qp_state = IB_QPS_ERR; - ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); - } - - wait_event(rds_iw_ring_empty_wait, - rds_iw_ring_empty(&ic->i_send_ring) && - rds_iw_ring_empty(&ic->i_recv_ring)); - - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); - - if (ic->i_sends) - rds_iw_send_clear_ring(ic); - if (ic->i_recvs) - rds_iw_recv_clear_ring(ic); - - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - - /* - * If associated with an rds_iw_device: - * Move connection back to the nodev list. - * Remove cm_id from the device cm_id list. - */ - if (ic->rds_iwdev) - rds_iw_remove_conn(ic->rds_iwdev, conn); - - rdma_destroy_id(ic->i_cm_id); - - ic->i_cm_id = NULL; - ic->i_pd = NULL; - ic->i_mr = NULL; - ic->i_send_cq = NULL; - ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; - } - BUG_ON(ic->rds_iwdev); - - /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; - } - - /* Clear the ACK state */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_set(&ic->i_ack_next, 0); -#else - ic->i_ack_next = 0; -#endif - ic->i_ack_recv = 0; - - /* Clear flow control state */ - ic->i_flowctl = 0; - atomic_set(&ic->i_credits, 0); - - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - if (ic->i_iwinc) { - rds_inc_put(&ic->i_iwinc->ii_inc); - ic->i_iwinc = NULL; - } - - vfree(ic->i_sends); - ic->i_sends = NULL; - vfree(ic->i_recvs); - ic->i_recvs = NULL; - rdsdebug("shutdown complete\n"); -} - -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) -{ - struct rds_iw_connection *ic; - unsigned long flags; - - /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), gfp); - if (!ic) - return -ENOMEM; - - INIT_LIST_HEAD(&ic->iw_node); - tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, - (unsigned long) ic); - mutex_init(&ic->i_recv_mutex); -#ifndef KERNEL_HAS_ATOMIC64 - spin_lock_init(&ic->i_ack_lock); -#endif - - /* - * rds_iw_conn_shutdown() waits for these to be emptied so they - * must be initialized before it can be called. - */ - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - ic->conn = conn; - conn->c_transport_data = ic; - - spin_lock_irqsave(&iw_nodev_conns_lock, flags); - list_add_tail(&ic->iw_node, &iw_nodev_conns); - spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); - - - rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); - return 0; -} - -/* - * Free a connection. Connection must be shut down and not set for reconnect. - */ -void rds_iw_conn_free(void *arg) -{ - struct rds_iw_connection *ic = arg; - spinlock_t *lock_ptr; - - rdsdebug("ic %p\n", ic); - - /* - * Conn is either on a dev's list or on the nodev list. - * A race with shutdown() or connect() would cause problems - * (since rds_iwdev would change) but that should never happen. - */ - lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; - - spin_lock_irq(lock_ptr); - list_del(&ic->iw_node); - spin_unlock_irq(lock_ptr); - - kfree(ic); -} - -/* - * An error occurred on the connection - */ -void -__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) -{ - va_list ap; - - rds_conn_drop(conn); - - va_start(ap, fmt); - vprintk(fmt, ap); - va_end(ap); -} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c deleted file mode 100644 index b09a40c1adce..000000000000 --- a/net/rds/iw_rdma.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - - -/* - * This is stored as mr->r_trans_private. - */ -struct rds_iw_mr { - struct rds_iw_device *device; - struct rds_iw_mr_pool *pool; - struct rdma_cm_id *cm_id; - - struct ib_mr *mr; - - struct rds_iw_mapping mapping; - unsigned char remap_count; -}; - -/* - * Our own little MR pool - */ -struct rds_iw_mr_pool { - struct rds_iw_device *device; /* back ptr to the device that owns us */ - - struct mutex flush_lock; /* serialize fmr invalidate */ - struct work_struct flush_worker; /* flush worker */ - - spinlock_t list_lock; /* protect variables below */ - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - struct list_head dirty_list; /* dirty mappings */ - struct list_head clean_list; /* unused & unamapped MRs */ - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_message_size; /* in pages */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; - int max_pages; -}; - -static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); -static void rds_iw_mr_pool_flush_worker(struct work_struct *work); -static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, unsigned int nents); -static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, - struct list_head *unmap_list, - struct list_head *kill_list, - int *unpinned); -static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); - -static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, - struct rds_iw_device **rds_iwdev, - struct rdma_cm_id **cm_id) -{ - struct rds_iw_device *iwdev; - struct rds_iw_cm_id *i_cm_id; - - *rds_iwdev = NULL; - *cm_id = NULL; - - list_for_each_entry(iwdev, &rds_iw_devices, list) { - spin_lock_irq(&iwdev->spinlock); - list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { - struct sockaddr_in *src_addr, *dst_addr; - - src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; - dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; - - rdsdebug("local ipaddr = %x port %d, " - "remote ipaddr = %x port %d" - "..looking for %x port %d, " - "remote ipaddr = %x port %d\n", - src_addr->sin_addr.s_addr, - src_addr->sin_port, - dst_addr->sin_addr.s_addr, - dst_addr->sin_port, - src->sin_addr.s_addr, - src->sin_port, - dst->sin_addr.s_addr, - dst->sin_port); -#ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && - src_addr->sin_port == src->sin_port && - dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && - dst_addr->sin_port == dst->sin_port) { -#else - /* FIXME - needs to compare the local and remote - * ipaddr/port tuple, but the ipaddr is the only - * available information in the rds_sock (as the rest are - * zero'ed. It doesn't appear to be properly populated - * during connection setup... - */ - if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { -#endif - spin_unlock_irq(&iwdev->spinlock); - *rds_iwdev = iwdev; - *cm_id = i_cm_id->cm_id; - return 0; - } - } - spin_unlock_irq(&iwdev->spinlock); - } - - return 1; -} - -static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); - if (!i_cm_id) - return -ENOMEM; - - i_cm_id->cm_id = cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); - spin_unlock_irq(&rds_iwdev->spinlock); - - return 0; -} - -static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, - struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { - if (i_cm_id->cm_id == cm_id) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - break; - } - } - spin_unlock_irq(&rds_iwdev->spinlock); -} - - -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct sockaddr_in *src_addr, *dst_addr; - struct rds_iw_device *rds_iwdev_old; - struct rdma_cm_id *pcm_id; - int rc; - - src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; - dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - - rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); - if (rc) - rds_iw_remove_cm_id(rds_iwdev, cm_id); - - return rds_iw_add_cm_id(rds_iwdev, cm_id); -} - -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* conn was previously on the nodev_conns_list */ - spin_lock_irq(&iw_nodev_conns_lock); - BUG_ON(list_empty(&iw_nodev_conns)); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - - spin_lock(&rds_iwdev->spinlock); - list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock(&rds_iwdev->spinlock); - spin_unlock_irq(&iw_nodev_conns_lock); - - ic->rds_iwdev = rds_iwdev; -} - -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* place conn on nodev_conns_list */ - spin_lock(&iw_nodev_conns_lock); - - spin_lock_irq(&rds_iwdev->spinlock); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - spin_unlock_irq(&rds_iwdev->spinlock); - - list_add_tail(&ic->iw_node, &iw_nodev_conns); - - spin_unlock(&iw_nodev_conns_lock); - - rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); - ic->rds_iwdev = NULL; -} - -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) -{ - struct rds_iw_connection *ic, *_ic; - LIST_HEAD(tmp_list); - - /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); - - list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) - rds_conn_destroy(ic->conn); -} - -static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, - struct scatterlist *list, unsigned int sg_len) -{ - sg->list = list; - sg->len = sg_len; - sg->dma_len = 0; - sg->dma_npages = 0; - sg->bytes = 0; -} - -static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, - struct rds_iw_scatterlist *sg) -{ - struct ib_device *dev = rds_iwdev->dev; - int i, ret; - - WARN_ON(sg->dma_len); - - sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - if (unlikely(!sg->dma_len)) { - printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); - return -EBUSY; - } - - sg->bytes = 0; - sg->dma_npages = 0; - - ret = -EINVAL; - for (i = 0; i < sg->dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); - u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); - u64 end_addr; - - sg->bytes += dma_len; - - end_addr = dma_addr + dma_len; - if (dma_addr & PAGE_MASK) { - if (i > 0) - goto out_unmap; - dma_addr &= ~PAGE_MASK; - } - if (end_addr & PAGE_MASK) { - if (i < sg->dma_len - 1) - goto out_unmap; - end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; - } - - sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; - } - - /* Now gather the dma addrs into one list */ - if (sg->dma_npages > fastreg_message_size) - goto out_unmap; - - - - return 0; - -out_unmap: - ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - sg->dma_len = 0; - return ret; -} - - -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); - return ERR_PTR(-ENOMEM); - } - - pool->device = rds_iwdev; - INIT_LIST_HEAD(&pool->dirty_list); - INIT_LIST_HEAD(&pool->clean_list); - mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); - INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); - - pool->max_message_size = fastreg_message_size; - pool->max_items = fastreg_pool_size; - pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; - pool->max_pages = fastreg_message_size; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ - pool->max_items_soft = pool->max_items * 3 / 4; - - return pool; -} - -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->max_pages; -} - -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) -{ - flush_workqueue(rds_wq); - rds_iw_flush_mr_pool(pool, 1); - BUG_ON(atomic_read(&pool->item_count)); - BUG_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) -{ - struct rds_iw_mr *ibmr = NULL; - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); - list_del_init(&ibmr->mapping.m_list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); - - return ibmr; -} - -static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - struct rds_iw_mr *ibmr = NULL; - int err = 0, iter = 0; - - while (1) { - ibmr = rds_iw_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. */ - rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); - rds_iw_flush_mr_pool(pool, 0); - } - - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - spin_lock_init(&ibmr->mapping.m_lock); - INIT_LIST_HEAD(&ibmr->mapping.m_list); - ibmr->mapping.m_mr = ibmr; - - err = rds_iw_init_reg(pool, ibmr); - if (err) - goto out_no_cigar; - - rds_iw_stats_inc(s_iw_rdma_mr_alloc); - return ibmr; - -out_no_cigar: - if (ibmr) { - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -void rds_iw_sync_mr(void *trans_private, int direction) -{ - struct rds_iw_mr *ibmr = trans_private; - struct rds_iw_device *rds_iwdev = ibmr->device; - - switch (direction) { - case DMA_FROM_DEVICE: - ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - case DMA_TO_DEVICE: - ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - } -} - -/* - * Flush our pool of MRs. - * At a minimum, all currently unused MRs are unmapped. - * If the number of MRs allocated exceeds the limit, we also try - * to free as many MRs as needed to get back to this limit. - */ -static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) -{ - struct rds_iw_mr *ibmr, *next; - LIST_HEAD(unmap_list); - LIST_HEAD(kill_list); - unsigned long flags; - unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; - - rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); - - mutex_lock(&pool->flush_lock); - - spin_lock_irqsave(&pool->list_lock, flags); - /* Get the list of all mappings to be destroyed */ - list_splice_init(&pool->dirty_list, &unmap_list); - if (free_all) - list_splice_init(&pool->clean_list, &kill_list); - spin_unlock_irqrestore(&pool->list_lock, flags); - - /* Batched invalidate of dirty MRs. - * For FMR based MRs, the mappings on the unmap list are - * actually members of an ibmr (ibmr->mapping). They either - * migrate to the kill_list, or have been cleaned and should be - * moved to the clean_list. - * For fastregs, they will be dynamically allocated, and - * will be destroyed by the unmap function. - */ - if (!list_empty(&unmap_list)) { - ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, - &kill_list, &unpinned); - /* If we've been asked to destroy all MRs, move those - * that were simply cleaned to the kill list */ - if (free_all) - list_splice_init(&unmap_list, &kill_list); - } - - /* Destroy any MRs that are past their best before date */ - list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { - rds_iw_stats_inc(s_iw_rdma_mr_free); - list_del(&ibmr->mapping.m_list); - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - nfreed++; - } - - /* Anything that remains are laundered ibmrs, which we can add - * back to the clean list. */ - if (!list_empty(&unmap_list)) { - spin_lock_irqsave(&pool->list_lock, flags); - list_splice(&unmap_list, &pool->clean_list); - spin_unlock_irqrestore(&pool->list_lock, flags); - } - - atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); - atomic_sub(nfreed, &pool->item_count); - - mutex_unlock(&pool->flush_lock); -} - -static void rds_iw_mr_pool_flush_worker(struct work_struct *work) -{ - struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); - - rds_iw_flush_mr_pool(pool, 0); -} - -void rds_iw_free_mr(void *trans_private, int invalidate) -{ - struct rds_iw_mr *ibmr = trans_private; - struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; - - rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); - if (!pool) - return; - - /* Return it to the pool's free list */ - rds_iw_free_fastreg(pool, ibmr); - - /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_work(rds_wq, &pool->flush_worker); - - if (invalidate) { - if (likely(!in_interrupt())) { - rds_iw_flush_mr_pool(pool, 0); - } else { - /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - queue_work(rds_wq, &pool->flush_worker); - } - } -} - -void rds_iw_flush_mrs(void) -{ - struct rds_iw_device *rds_iwdev; - - list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - - if (pool) - rds_iw_flush_mr_pool(pool, 0); - } -} - -void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret) -{ - struct rds_iw_device *rds_iwdev; - struct rds_iw_mr *ibmr = NULL; - struct rdma_cm_id *cm_id; - struct sockaddr_in src = { - .sin_addr.s_addr = rs->rs_bound_addr, - .sin_port = rs->rs_bound_port, - }; - struct sockaddr_in dst = { - .sin_addr.s_addr = rs->rs_conn_addr, - .sin_port = rs->rs_conn_port, - }; - int ret; - - ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); - if (ret || !cm_id) { - ret = -ENODEV; - goto out; - } - - if (!rds_iwdev->mr_pool) { - ret = -ENODEV; - goto out; - } - - ibmr = rds_iw_alloc_mr(rds_iwdev); - if (IS_ERR(ibmr)) - return ibmr; - - ibmr->cm_id = cm_id; - ibmr->device = rds_iwdev; - - ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents); - if (ret == 0) - *key_ret = ibmr->mr->rkey; - else - printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); - -out: - if (ret) { - if (ibmr) - rds_iw_free_mr(ibmr, 0); - ibmr = ERR_PTR(ret); - } - return ibmr; -} - -/* - * iWARP reg handling - * - * The life cycle of a fastreg registration is a bit different from - * FMRs. - * The idea behind fastreg is to have one MR, to which we bind different - * mappings over time. To avoid stalling on the expensive map and invalidate - * operations, these operations are pipelined on the same send queue on - * which we want to send the message containing the r_key. - * - * This creates a bit of a problem for us, as we do not have the destination - * IP in GET_MR, so the connection must be setup prior to the GET_MR call for - * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit - * will try to queue a LOCAL_INV (if needed) and a REG_MR work request - * before queuing the SEND. When completions for these arrive, they are - * dispatched to the MR has a bit set showing that RDMa can be performed. - * - * There is another interesting aspect that's related to invalidation. - * The application can request that a mapping is invalidated in FREE_MR. - * The expectation there is that this invalidation step includes ALL - * PREVIOUSLY FREED MRs. - */ -static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct ib_mr *mr; - int err; - - mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, - pool->max_message_size); - if (IS_ERR(mr)) { - err = PTR_ERR(mr); - - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); - return err; - } - - ibmr->mr = mr; - return 0; -} - -static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping) -{ - struct rds_iw_mr *ibmr = mapping->m_mr; - struct rds_iw_scatterlist *m_sg = &mapping->m_sg; - struct ib_reg_wr reg_wr; - struct ib_send_wr *failed_wr; - int ret, n; - - n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE); - if (unlikely(n != m_sg->len)) - return n < 0 ? n : -EINVAL; - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = RDS_IW_REG_WR_ID; - reg_wr.wr.num_sge = 0; - reg_wr.mr = ibmr->mr; - reg_wr.key = mapping->m_rkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - - /* - * Perform a WR for the reg_mr. Each individual page - * in the sg list is added to the fast reg page list and placed - * inside the reg_mr WR. The key used is a rolling 8bit - * counter, which should guarantee uniqueness. - */ - ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); - mapping->m_rkey = ibmr->mr->rkey; - - failed_wr = ®_wr.wr; - ret = ib_post_send(ibmr->cm_id->qp, ®_wr.wr, &failed_wr); - BUG_ON(failed_wr != ®_wr.wr); - if (ret) - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - return ret; -} - -static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) -{ - struct ib_send_wr s_wr, *failed_wr; - int ret = 0; - - if (!ibmr->cm_id->qp || !ibmr->mr) - goto out; - - memset(&s_wr, 0, sizeof(s_wr)); - s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; - s_wr.opcode = IB_WR_LOCAL_INV; - s_wr.ex.invalidate_rkey = ibmr->mr->rkey; - s_wr.send_flags = IB_SEND_SIGNALED; - - failed_wr = &s_wr; - ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); - if (ret) { - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - goto out; - } -out: - return ret; -} - -static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, - unsigned int sg_len) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct rds_iw_mapping *mapping = &ibmr->mapping; - u64 *dma_pages; - int ret = 0; - - rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); - - ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); - if (ret) { - dma_pages = NULL; - goto out; - } - - if (mapping->m_sg.dma_len > pool->max_message_size) { - ret = -EMSGSIZE; - goto out; - } - - ret = rds_iw_rdma_reg_mr(mapping); - if (ret) - goto out; - - rds_iw_stats_inc(s_iw_rdma_mr_used); - -out: - kfree(dma_pages); - - return ret; -} - -/* - * "Free" a fastreg MR. - */ -static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - unsigned long flags; - int ret; - - if (!ibmr->mapping.m_sg.dma_len) - return; - - ret = rds_iw_rdma_fastreg_inv(ibmr); - if (ret) - return; - - /* Try to post the LOCAL_INV WR to the queue. */ - spin_lock_irqsave(&pool->list_lock, flags); - - list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); - atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); - atomic_inc(&pool->dirty_count); - - spin_unlock_irqrestore(&pool->list_lock, flags); -} - -static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, - struct list_head *unmap_list, - struct list_head *kill_list, - int *unpinned) -{ - struct rds_iw_mapping *mapping, *next; - unsigned int ncleaned = 0; - LIST_HEAD(laundered); - - /* Batched invalidation of fastreg MRs. - * Why do we do it this way, even though we could pipeline unmap - * and remap? The reason is the application semantics - when the - * application requests an invalidation of MRs, it expects all - * previously released R_Keys to become invalid. - * - * If we implement MR reuse naively, we risk memory corruption - * (this has actually been observed). So the default behavior - * requires that a MR goes through an explicit unmap operation before - * we can reuse it again. - * - * We could probably improve on this a little, by allowing immediate - * reuse of a MR on the same socket (eg you could add small - * cache of unused MRs to strct rds_socket - GET_MR could grab one - * of these without requiring an explicit invalidate). - */ - while (!list_empty(unmap_list)) { - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - list_for_each_entry_safe(mapping, next, unmap_list, m_list) { - *unpinned += mapping->m_sg.len; - list_move(&mapping->m_list, &laundered); - ncleaned++; - } - spin_unlock_irqrestore(&pool->list_lock, flags); - } - - /* Move all laundered mappings back to the unmap list. - * We do not kill any WRs right now - it doesn't seem the - * fastreg API has a max_remap limit. */ - list_splice_init(&laundered, unmap_list); - - return ncleaned; -} - -static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - if (ibmr->mr) - ib_dereg_mr(ibmr->mr); -} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c deleted file mode 100644 index a66d1794b2d0..000000000000 --- a/net/rds/iw_recv.c +++ /dev/null @@ -1,904 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <rdma/rdma_cm.h> - -#include "rds.h" -#include "iw.h" - -static struct kmem_cache *rds_iw_incoming_slab; -static struct kmem_cache *rds_iw_frag_slab; -static atomic_t rds_iw_allocation = ATOMIC_INIT(0); - -static void rds_iw_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - -static void rds_iw_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page); - kmem_cache_free(rds_iw_frag_slab, frag); -} - -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; -} - -void rds_iw_recv_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_recv_work *recv; - u32 i; - - for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { - struct ib_sge *sge; - - recv->r_iwinc = NULL; - recv->r_frag = NULL; - - recv->r_wr.next = NULL; - recv->r_wr.wr_id = i; - recv->r_wr.sg_list = recv->r_sge; - recv->r_wr.num_sge = RDS_IW_RECV_SGE; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = 0; - sge->length = RDS_FRAG_SIZE; - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - } -} - -static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - if (recv->r_iwinc) { - rds_inc_put(&recv->r_iwinc->ii_inc); - recv->r_iwinc = NULL; - } - if (recv->r_frag) { - rds_iw_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_iw_frag_drop_page(recv->r_frag); - rds_iw_frag_free(recv->r_frag); - recv->r_frag = NULL; - } -} - -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) -{ - u32 i; - - for (i = 0; i < ic->i_recv_ring.w_nr; i++) - rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); - - if (ic->i_frag.f_page) - rds_iw_frag_drop_page(&ic->i_frag); -} - -static int rds_iw_recv_refill_one(struct rds_connection *conn, - struct rds_iw_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; - struct ib_sge *sge; - int ret = -ENOMEM; - - if (!recv->r_iwinc) { - if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { - rds_iw_stats_inc(s_iw_rx_alloc_limit); - goto out; - } - recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, - kptr_gfp); - if (!recv->r_iwinc) { - atomic_dec(&rds_iw_allocation); - goto out; - } - INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); - rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); - } - - if (!recv->r_frag) { - recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (!recv->r_frag) - goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; - } - - if (!ic->i_frag.f_page) { - ic->i_frag.f_page = alloc_page(page_gfp); - if (!ic->i_frag.f_page) - goto out; - ic->i_frag.f_offset = 0; - } - - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) - goto out; - - /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. - */ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); - sge->length = sizeof(struct rds_header); - - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } - - ret = 0; -out: - return ret; -} - -/* - * This tries to allocate and post unused work requests after making sure that - * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. - * - * -1 is returned if posting fails due to temporary resource exhaustion. - */ -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_recv_work *recv; - struct ib_recv_wr *failed_wr; - unsigned int posted = 0; - int ret = 0; - u32 pos; - - while ((prefill || rds_conn_up(conn)) && - rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { - if (pos >= ic->i_recv_ring.w_nr) { - printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", - pos); - ret = -EINVAL; - break; - } - - recv = &ic->i_recvs[pos]; - ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); - if (ret) { - ret = -1; - break; - } - - /* XXX when can this fail? */ - ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); - rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, - recv->r_iwinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); - if (ret) { - rds_iw_conn_error(conn, "recv post on " - "%pI4 returned %d, disconnecting and " - "reconnecting\n", &conn->c_faddr, - ret); - ret = -1; - break; - } - - posted++; - } - - /* We're doing flow control - update the window. */ - if (ic->i_flowctl && posted) - rds_iw_advertise_credits(conn, posted); - - if (ret) - rds_iw_ring_unalloc(&ic->i_recv_ring, 1); - return ret; -} - -static void rds_iw_inc_purge(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); - - list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_iw_frag_drop_page(frag); - rds_iw_frag_free(frag); - } -} - -void rds_iw_inc_free(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - - rds_iw_inc_purge(inc); - rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); - BUG_ON(!list_empty(&iwinc->ii_frags)); - kmem_cache_free(rds_iw_incoming_slab, iwinc); - atomic_dec(&rds_iw_allocation); - BUG_ON(atomic_read(&rds_iw_allocation) < 0); -} - -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - unsigned long to_copy; - unsigned long frag_off = 0; - int copied = 0; - int ret; - u32 len; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - len = be32_to_cpu(inc->i_hdr.h_len); - - while (iov_iter_count(to) && copied < len) { - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - to_copy = min_t(unsigned long, iov_iter_count(to), - RDS_FRAG_SIZE - frag_off); - to_copy = min_t(unsigned long, to_copy, len - copied); - - /* XXX needs + offset for multiple recvs per page */ - rds_stats_add(s_copy_to_user, to_copy); - ret = copy_page_to_iter(frag->f_page, - frag->f_offset + frag_off, - to_copy, - to); - if (ret != to_copy) - return -EFAULT; - - frag_off += to_copy; - copied += to_copy; - } - - return copied; -} - -/* ic starts out kzalloc()ed */ -void rds_iw_recv_init_ack(struct rds_iw_connection *ic) -{ - struct ib_send_wr *wr = &ic->i_ack_wr; - struct ib_sge *sge = &ic->i_ack_sge; - - sge->addr = ic->i_ack_dma; - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); - - wr->sg_list = sge; - wr->num_sge = 1; - wr->opcode = IB_WR_SEND; - wr->wr_id = RDS_IW_ACK_WR_ID; - wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; -} - -/* - * You'd think that with reliable IB connections you wouldn't need to ack - * messages that have been received. The problem is that IB hardware generates - * an ack message before it has DMAed the message into memory. This creates a - * potential message loss if the HCA is disabled for any reason between when it - * sends the ack and before the message is DMAed and processed. This is only a - * potential issue if another HCA is available for fail-over. - * - * When the remote host receives our ack they'll free the sent message from - * their send queue. To decrease the latency of this we always send an ack - * immediately after we've received messages. - * - * For simplicity, we only have one ack in flight at a time. This puts - * pressure on senders to have deep enough send queues to absorb the latency of - * a single ack frame being in flight. This might not be good enough. - * - * This is implemented by have a long-lived send_wr and sge which point to a - * statically allocated ack frame. This ack wr does not fall under the ring - * accounting that the tx and rx wrs do. The QP attribute specifically makes - * room for it beyond the ring size. Send completion notices its special - * wr_id and avoids working with the ring in that case. - */ -#ifndef KERNEL_HAS_ATOMIC64 -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - unsigned long flags; - - spin_lock_irqsave(&ic->i_ack_lock, flags); - ic->i_ack_next = seq; - if (ack_required) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - spin_unlock_irqrestore(&ic->i_ack_lock, flags); -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - unsigned long flags; - u64 seq; - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - spin_lock_irqsave(&ic->i_ack_lock, flags); - seq = ic->i_ack_next; - spin_unlock_irqrestore(&ic->i_ack_lock, flags); - - return seq; -} -#else -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - atomic64_set(&ic->i_ack_next, seq); - if (ack_required) { - smp_mb__before_atomic(); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - } -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_atomic(); - - return atomic64_read(&ic->i_ack_next); -} -#endif - - -static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) -{ - struct rds_header *hdr = ic->i_ack; - struct ib_send_wr *failed_wr; - u64 seq; - int ret; - - seq = rds_iw_get_ack(ic); - - rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); - rds_message_populate_header(hdr, 0, 0, 0); - hdr->h_ack = cpu_to_be64(seq); - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - ic->i_ack_queued = jiffies; - - ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); - if (unlikely(ret)) { - /* Failed to send. Release the WR, and - * force another ACK. - */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - rds_iw_stats_inc(s_iw_ack_send_failure); - - rds_iw_conn_error(ic->conn, "sending ack failed\n"); - } else - rds_iw_stats_inc(s_iw_ack_sent); -} - -/* - * There are 3 ways of getting acknowledgements to the peer: - * 1. We call rds_iw_attempt_ack from the recv completion handler - * to send an ACK-only frame. - * However, there can be only one such frame in the send queue - * at any time, so we may have to postpone it. - * 2. When another (data) packet is transmitted while there's - * an ACK in the queue, we piggyback the ACK sequence number - * on the data packet. - * 3. If the ACK WR is done sending, we get called from the - * send queue completion handler, and check whether there's - * another ACK pending (postponed because the WR was on the - * queue). If so, we transmit it. - * - * We maintain 2 variables: - * - i_ack_flags, which keeps track of whether the ACK WR - * is currently in the send queue or not (IB_ACK_IN_FLIGHT) - * - i_ack_next, which is the last sequence number we received - * - * Potentially, send queue and receive queue handlers can run concurrently. - * It would be nice to not have to use a spinlock to synchronize things, - * but the one problem that rules this out is that 64bit updates are - * not atomic on all platforms. Things would be a lot simpler if - * we had atomic64 or maybe cmpxchg64 everywhere. - * - * Reconnecting complicates this picture just slightly. When we - * reconnect, we may be seeing duplicate packets. The peer - * is retransmitting them, because it hasn't seen an ACK for - * them. It is important that we ACK these. - * - * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with - * this flag set *MUST* be acknowledged immediately. - */ - -/* - * When we get here, we're called from the recv queue handler. - * Check whether we ought to transmit an ACK. - */ -void rds_iw_attempt_ack(struct rds_iw_connection *ic) -{ - unsigned int adv_credits; - - if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - return; - - if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { - rds_iw_stats_inc(s_iw_ack_send_delayed); - return; - } - - /* Can we get a send credit? */ - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { - rds_iw_stats_inc(s_iw_tx_throttle); - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - return; - } - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - rds_iw_send_ack(ic, adv_credits); -} - -/* - * We get here from the send completion handler, when the - * adapter tells us the ACK frame was sent. - */ -void rds_iw_ack_send_complete(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - rds_iw_attempt_ack(ic); -} - -/* - * This is called by the regular xmit code when it wants to piggyback - * an ACK on an outgoing frame. - */ -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) -{ - if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - rds_iw_stats_inc(s_iw_ack_send_piggybacked); - return rds_iw_get_ack(ic); -} - -/* - * It's kind of lame that we're copying from the posted receive pages into - * long-lived bitmaps. We could have posted the bitmaps and rdma written into - * them. But receiving new congestion bitmaps should be a *rare* event, so - * hopefully we won't need to invest that complexity in making it more - * efficient. By copying we can share a simpler core with TCP which has to - * copy. - */ -static void rds_iw_cong_recv(struct rds_connection *conn, - struct rds_iw_incoming *iwinc) -{ - struct rds_cong_map *map; - unsigned int map_off; - unsigned int map_page; - struct rds_page_frag *frag; - unsigned long frag_off; - unsigned long to_copy; - unsigned long copied; - uint64_t uncongested = 0; - void *addr; - - /* catch completely corrupt packets */ - if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) - return; - - map = conn->c_fcong; - map_page = 0; - map_off = 0; - - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - frag_off = 0; - - copied = 0; - - while (copied < RDS_CONG_MAP_BYTES) { - uint64_t *src, *dst; - unsigned int k; - - to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); - BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - - addr = kmap_atomic(frag->f_page); - - src = addr + frag_off; - dst = (void *)map->m_page_addrs[map_page] + map_off; - for (k = 0; k < to_copy; k += 8) { - /* Record ports that became uncongested, ie - * bits that changed from 0 to 1. */ - uncongested |= ~(*src) & *dst; - *dst++ = *src++; - } - kunmap_atomic(addr); - - copied += to_copy; - - map_off += to_copy; - if (map_off == PAGE_SIZE) { - map_off = 0; - map_page++; - } - - frag_off += to_copy; - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - } - - /* the congestion map is in little endian order */ - uncongested = le64_to_cpu(uncongested); - - rds_cong_map_updated(map, uncongested); -} - -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_iw_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - -static void rds_iw_process_recv(struct rds_connection *conn, - struct rds_iw_recv_work *recv, u32 byte_len, - struct rds_iw_ack_state *state) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_incoming *iwinc = ic->i_iwinc; - struct rds_header *ihdr, *hdr; - - /* XXX shut down the connection if port 0,0 are seen? */ - - rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, - byte_len); - - if (byte_len < sizeof(struct rds_header)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 didn't include a " - "header, disconnecting and " - "reconnecting\n", - &conn->c_faddr); - return; - } - byte_len -= sizeof(struct rds_header); - - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; - - /* Validate the checksum. */ - if (!rds_message_verify_checksum(ihdr)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 has corrupted header - " - "forcing a reconnect\n", - &conn->c_faddr); - rds_stats_inc(s_recv_drop_bad_checksum); - return; - } - - /* Process the ACK sequence which comes with every packet */ - state->ack_recv = be64_to_cpu(ihdr->h_ack); - state->ack_recv_valid = 1; - - /* Process the credits update if there was one */ - if (ihdr->h_credit) - rds_iw_send_add_credits(conn, ihdr->h_credit); - - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { - /* This is an ACK-only packet. The fact that it gets - * special treatment here is that historically, ACKs - * were rather special beasts. - */ - rds_iw_stats_inc(s_iw_ack_received); - - /* - * Usually the frags make their way on to incs and are then freed as - * the inc is freed. We don't go that route, so we have to drop the - * page ref ourselves. We can't just leave the page on the recv - * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. - * - * FIXME: Fold this into the code path below. - */ - rds_iw_frag_drop_page(recv->r_frag); - return; - } - - /* - * If we don't already have an inc on the connection then this - * fragment has a header and starts a message.. copy its header - * into the inc and save the inc so we can hang upcoming fragments - * off its list. - */ - if (!iwinc) { - iwinc = recv->r_iwinc; - recv->r_iwinc = NULL; - ic->i_iwinc = iwinc; - - hdr = &iwinc->ii_inc.i_hdr; - memcpy(hdr, ihdr, sizeof(*hdr)); - ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); - - rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, - ic->i_recv_data_rem, hdr->h_flags); - } else { - hdr = &iwinc->ii_inc.i_hdr; - /* We can't just use memcmp here; fragments of a - * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence || - hdr->h_len != ihdr->h_len || - hdr->h_sport != ihdr->h_sport || - hdr->h_dport != ihdr->h_dport) { - rds_iw_conn_error(conn, - "fragment header mismatch; forcing reconnect\n"); - return; - } - } - - list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); - recv->r_frag = NULL; - - if (ic->i_recv_data_rem > RDS_FRAG_SIZE) - ic->i_recv_data_rem -= RDS_FRAG_SIZE; - else { - ic->i_recv_data_rem = 0; - ic->i_iwinc = NULL; - - if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) - rds_iw_cong_recv(conn, iwinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &iwinc->ii_inc, GFP_ATOMIC); - state->ack_next = be64_to_cpu(hdr->h_sequence); - state->ack_next_valid = 1; - } - - /* Evaluate the ACK_REQUIRED flag *after* we received - * the complete frame, and after bumping the next_rx - * sequence. */ - if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { - rds_stats_inc(s_recv_ack_required); - state->ack_required = 1; - } - - rds_inc_put(&iwinc->ii_inc); - } -} - -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. - */ -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_iw_stats_inc(s_iw_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_iw_connection *ic, - struct rds_iw_ack_state *state) -{ - struct rds_connection *conn = ic->conn; - struct ib_wc wc; - struct rds_iw_recv_work *recv; - - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_rx_cq_event); - - recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; - - rds_iw_recv_unmap_page(ic, recv); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. - */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { - /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_iw_process_recv(conn, recv, wc.byte_len, state); - } else { - rds_iw_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } - } - - rds_iw_ring_free(&ic->i_recv_ring, 1); - } -} - -void rds_iw_recv_tasklet_fn(unsigned long data) -{ - struct rds_iw_connection *ic = (struct rds_iw_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_iw_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - - if (state.ack_next_valid) - rds_iw_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; - } - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - /* If we ever end up with a really empty receive ring, we're - * in deep trouble, as the sender will definitely see RNR - * timeouts. */ - if (rds_iw_ring_empty(&ic->i_recv_ring)) - rds_iw_stats_inc(s_iw_rx_ring_empty); - - /* - * If the ring is running low, then schedule the thread to refill. - */ - if (rds_iw_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); -} - -int rds_iw_recv(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int ret = 0; - - rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_iw_stats_inc(s_iw_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - return ret; -} - -int rds_iw_recv_init(void) -{ - struct sysinfo si; - int ret = -ENOMEM; - - /* Default to 30% of all available RAM for recv memory */ - si_meminfo(&si); - rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; - - rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", - sizeof(struct rds_iw_incoming), - 0, 0, NULL); - if (!rds_iw_incoming_slab) - goto out; - - rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", - sizeof(struct rds_page_frag), - 0, 0, NULL); - if (!rds_iw_frag_slab) - kmem_cache_destroy(rds_iw_incoming_slab); - else - ret = 0; -out: - return ret; -} - -void rds_iw_recv_exit(void) -{ - kmem_cache_destroy(rds_iw_incoming_slab); - kmem_cache_destroy(rds_iw_frag_slab); -} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c deleted file mode 100644 index da8e3b63f663..000000000000 --- a/net/rds/iw_ring.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> - -#include "rds.h" -#include "iw.h" - -/* - * Locking for IB rings. - * We assume that allocation is always protected by a mutex - * in the caller (this is a valid assumption for the current - * implementation). - * - * Freeing always happens in an interrupt, and hence only - * races with allocations, but not with other free()s. - * - * The interaction between allocation and freeing is that - * the alloc code has to determine the number of free entries. - * To this end, we maintain two counters; an allocation counter - * and a free counter. Both are allowed to run freely, and wrap - * around. - * The number of used entries is always (alloc_ctr - free_ctr) % NR. - * - * The current implementation makes free_ctr atomic. When the - * caller finds an allocation fails, it should set an "alloc fail" - * bit and retry the allocation. The "alloc fail" bit essentially tells - * the CQ completion handlers to wake it up after freeing some - * more entries. - */ - -/* - * This only happens on shutdown. - */ -DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); - -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) -{ - memset(ring, 0, sizeof(*ring)); - ring->w_nr = nr; - rdsdebug("ring %p nr %u\n", ring, ring->w_nr); -} - -static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) -{ - u32 diff; - - /* This assumes that atomic_t has at least as many bits as u32 */ - diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); - BUG_ON(diff > ring->w_nr); - - return diff; -} - -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) -{ - /* We only ever get called from the connection setup code, - * prior to creating the QP. */ - BUG_ON(__rds_iw_ring_used(ring)); - ring->w_nr = nr; -} - -static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) == 0; -} - -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) -{ - u32 ret = 0, avail; - - avail = ring->w_nr - __rds_iw_ring_used(ring); - - rdsdebug("ring %p val %u next %u free %u\n", ring, val, - ring->w_alloc_ptr, avail); - - if (val && avail) { - ret = min(val, avail); - *pos = ring->w_alloc_ptr; - - ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; - ring->w_alloc_ctr += ret; - } - - return ret; -} - -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; - atomic_add(val, &ring->w_free_ctr); - - if (__rds_iw_ring_empty(ring) && - waitqueue_active(&rds_iw_ring_empty_wait)) - wake_up(&rds_iw_ring_empty_wait); -} - -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; - ring->w_alloc_ctr -= val; -} - -int rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_empty(ring); -} - -int rds_iw_ring_low(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); -} - - -/* - * returns the oldest alloced ring entry. This will be the next one - * freed. This can't be called if there are none allocated. - */ -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) -{ - return ring->w_free_ptr; -} - -/* - * returns the number of completed work requests. - */ - -u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) -{ - u32 ret; - - if (oldest <= (unsigned long long)wr_id) - ret = (unsigned long long)wr_id - oldest + 1; - else - ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; - - rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, - wr_id, oldest); - return ret; -} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c deleted file mode 100644 index e20bd503f4bd..000000000000 --- a/net/rds/iw_send.c +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/device.h> -#include <linux/dmapool.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -static void rds_iw_send_rdma_complete(struct rds_message *rm, - int wc_status) -{ - int notify_status; - - switch (wc_status) { - case IB_WC_WR_FLUSH_ERR: - return; - - case IB_WC_SUCCESS: - notify_status = RDS_RDMA_SUCCESS; - break; - - case IB_WC_REM_ACCESS_ERR: - notify_status = RDS_RDMA_REMOTE_ERROR; - break; - - default: - notify_status = RDS_RDMA_OTHER_ERROR; - break; - } - rds_rdma_send_complete(rm, notify_status); -} - -static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rm_rdma_op *op) -{ - if (op->op_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->op_mapped = 0; - } -} - -static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, - int wc_status) -{ - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.op_sg, rm->data.op_nents, - DMA_TO_DEVICE); - - if (rm->rdma.op_active) { - rds_iw_send_unmap_rdma(ic, &rm->rdma); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_iw_send_rdma_complete(rm, wc_status); - - if (rm->rdma.op_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); - } - - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - - rds_message_put(rm); - send->s_rm = NULL; -} - -void rds_iw_send_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - struct ib_sge *sge; - - send->s_rm = NULL; - send->s_op = NULL; - send->s_mapping = NULL; - - send->s_send_wr.next = NULL; - send->s_send_wr.wr_id = i; - send->s_send_wr.sg_list = send->s_sge; - send->s_send_wr.num_sge = 1; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.send_flags = 0; - send->s_send_wr.ex.imm_data = 0; - - sge = rds_iw_data_sge(ic, send->s_sge); - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, send->s_sge); - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - - send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, - fastreg_message_size); - if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); - break; - } - } -} - -void rds_iw_send_clear_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - BUG_ON(!send->s_mr); - ib_dereg_mr(send->s_mr); - if (send->s_send_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_iw_send_unmap_rdma(ic, send->s_op); - } -} - -/* - * The _oldest/_free ring operations here race cleanly with the alloc/unalloc - * operations performed in the send path. As the sender allocs and potentially - * unallocs the next free entry in the ring it doesn't alter which is - * the next to be freed, which is what this is concerned with. - */ -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_wc wc; - struct rds_iw_send_work *send; - u32 completed; - u32 oldest; - u32 i; - int ret; - - rdsdebug("cq %p conn %p\n", cq, conn); - rds_iw_stats_inc(s_iw_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_tx_cq_event); - - if (wc.status != IB_WC_SUCCESS) { - printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); - break; - } - - if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { - ic->i_fastreg_posted = 0; - continue; - } - - if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) { - ic->i_fastreg_posted = 1; - continue; - } - - if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - rds_iw_ack_send_complete(ic); - continue; - } - - oldest = rds_iw_ring_oldest(&ic->i_send_ring); - - completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); - - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_send_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_REG_MR: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - case IB_WR_RDMA_READ_WITH_INV: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - printk_ratelimited(KERN_NOTICE - "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_send_wr.opcode); - break; - } - - send->s_send_wr.opcode = 0xdead; - send->s_send_wr.num_sge = 1; - if (time_after(jiffies, send->s_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. */ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) - rds_iw_send_rdma_complete(rm, wc.status); - } - - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } - - rds_iw_ring_free(&ic->i_send_ring, completed); - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_iw_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); - } - } -} - -/* - * This is the main function for allocating credits when sending - * messages. - * - * Conceptually, we have two counters: - * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the receiver's queue. For - * each SEND WR we post, we decrement this by one. - * - * - posted credits: this tells us how many WRs we recently - * posted to the receive queue. This value is transferred - * to the peer as a "credit update" in a RDS header field. - * Every time we transmit credits to the peer, we subtract - * the amount of transferred credits from this counter. - * - * It is essential that we avoid situations where both sides have - * exhausted their send credits, and are unable to send new credits - * to the peer. We achieve this by requiring that we send at least - * one credit update to the peer before exhausting our credits. - * When new credits arrive, we subtract one credit that is withheld - * until we've posted new buffers and are ready to transmit these - * credits (see rds_iw_send_add_credits below). - * - * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. - * However, the ACK sending code is independent and can race with - * message SENDs. - * - * In the send path, we need to update the counters for send credits - * and the counter of posted buffers atomically - when we use the - * last available credit, we cannot allow another thread to race us - * and grab the posted credits counter. Hence, we have to use a - * spinlock to protect the credit counter, or use atomics. - * - * Spinlocks shared between the send and the receive path are bad, - * because they create unnecessary delays. An early implementation - * using a spinlock showed a 5% degradation in throughput at some - * loads. - * - * This implementation avoids spinlocks completely, putting both - * counters into a single atomic, and updating that atomic using - * atomic_add (in the receive path, when receiving fresh credits), - * and using atomic_cmpxchg when updating the two counters. - */ -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted, int max_posted) -{ - unsigned int avail, posted, got = 0, advertise; - long oldval, newval; - - *adv_credits = 0; - if (!ic->i_flowctl) - return wanted; - -try_again: - advertise = 0; - oldval = newval = atomic_read(&ic->i_credits); - posted = IB_GET_POST_CREDITS(oldval); - avail = IB_GET_SEND_CREDITS(oldval); - - rdsdebug("wanted=%u credits=%u posted=%u\n", - wanted, avail, posted); - - /* The last credit must be used to send a credit update. */ - if (avail && !posted) - avail--; - - if (avail < wanted) { - struct rds_connection *conn = ic->i_cm_id->context; - - /* Oops, there aren't that many credits left! */ - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - got = avail; - } else { - /* Sometimes you get what you want, lalala. */ - got = wanted; - } - newval -= IB_SET_SEND_CREDITS(got); - - /* - * If need_posted is non-zero, then the caller wants - * the posted regardless of whether any send credits are - * available. - */ - if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, max_posted); - newval -= IB_SET_POST_CREDITS(advertise); - } - - /* Finally bill everything */ - if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) - goto try_again; - - *adv_credits = advertise; - return got; -} - -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (credits == 0) - return; - - rdsdebug("credits=%u current=%u%s\n", - credits, - IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), - test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); - - atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); - - rds_iw_stats_inc(s_iw_rx_credit_updates); -} - -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (posted == 0) - return; - - atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); - - /* Decide whether to send an update to the peer now. - * If we would send a credit update for every single buffer we - * post, we would end up with an ACK storm (ACK arrives, - * consumes buffer, we refill the ring, send ACK to remote - * advertising the newly posted buffer... ad inf) - * - * Performance pretty much depends on how often we send - * credit updates - too frequent updates mean lots of ACKs. - * Too infrequent updates, and the peer will run out of - * credits and has to throttle. - * For the time being, 16 seems to be a good compromise. - */ - if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); -} - -static inline void -rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) -{ - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_send_wr.send_flags = send_flags; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.num_sge = 2; - send->s_send_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_iw_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = rds_iw_local_dma_lkey(ic); - - sge = rds_iw_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_send_wr.num_sge = 1; - sge = &send->s_sge[0]; - } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); -} - -/* - * This can be called multiple times for a given message. The first time - * we see a message we map its scatterlist into the IB device so that - * we can provide that mapped address to the IB scatter gather entries - * in the IB work requests. We translate the scatterlist into a series - * of work requests that fragment the message. These work requests complete - * in order so we pass ownership of the message to the completion handler - * once we send the final fragment. - * - * The RDS core uses the c_send_lock to only enter this function once - * per connection. This makes sure that the tx ring alloc/unalloc pairs - * don't get out of sync and confuse the ring. - */ -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct scatterlist *scat; - u32 pos; - u32 i; - u32 work_alloc; - u32 credit_alloc; - u32 posted; - u32 adv_credits = 0; - int send_flags = 0; - int sent; - int ret; - int flow_controlled = 0; - - BUG_ON(off % RDS_FRAG_SIZE); - BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); - - /* Fastreg support */ - if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { - ret = -EAGAIN; - goto out; - } - - /* FIXME we may overallocate here */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) - i = 1; - else - i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - credit_alloc = work_alloc; - if (ic->i_flowctl) { - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); - adv_credits += posted; - if (credit_alloc < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); - work_alloc = credit_alloc; - flow_controlled++; - } - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_throttle); - ret = -ENOMEM; - goto out; - } - } - - /* map the message the first time we see it */ - if (!ic->i_rm) { - /* - printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->data.op_nents) { - rm->data.op_count = ib_dma_map_sg(dev, - rm->data.op_sg, - rm->data.op_nents, - DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); - if (rm->data.op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - ret = -ENOMEM; /* XXX ? */ - goto out; - } - } else { - rm->data.op_count = 0; - } - - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - rds_message_addref(rm); - rm->data.op_dmasg = 0; - rm->data.op_dmaoff = 0; - ic->i_rm = rm; - - /* Finalize the header */ - if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; - if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; - - /* If it has a RDMA op, tell the peer we did it. This is - * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.op_active) { - struct rds_ext_header_rdma ext_hdr; - - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); - rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); - } - if (rm->m_rdma_cookie) { - rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, - rds_rdma_cookie_key(rm->m_rdma_cookie), - rds_rdma_cookie_offset(rm->m_rdma_cookie)); - } - - /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so - * we should not do this unless we have a chance of at least - * sticking the header into the send ring. Which is why we - * should call rds_iw_ring_alloc first. */ - rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); - rds_message_make_checksum(&rm->m_inc.i_hdr); - - /* - * Update adv_credits since we reset the ACK_REQUIRED bit. - */ - rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); - } - - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->data.op_sg[rm->data.op_dmasg]; - sent = 0; - i = 0; - - /* Sometimes you want to put a fence between an RDMA - * READ and the following SEND. - * We could either do this all the time - * or when requested by the user. Right now, we let - * the application choose. - */ - if (rm->rdma.op_active && rm->rdma.op_fence) - send_flags = IB_SEND_FENCE; - - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ - - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } - - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { - unsigned int len; - - send = &ic->i_sends[pos]; - - len = min(RDS_FRAG_SIZE, - ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); - rds_iw_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len, - send_flags); - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - /* - * Always signal the last one if we're stopping due to flow control. - */ - if (flow_controlled && i == (work_alloc-1)) - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next); - - sent += len; - rm->data.op_dmaoff += len; - if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { - scat++; - rm->data.op_dmaoff = 0; - rm->data.op_dmasg++; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - /* add credit and redo the header checksum */ - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - adv_credits = 0; - rds_iw_stats_inc(s_iw_tx_credit_updates); - } - - if (prev) - prev->s_send_wr.next = &send->s_send_wr; - prev = send; - - pos = (pos + 1) % ic->i_send_ring.w_nr; - } - - /* Account the RDS header in the number of bytes we sent, but just once. - * The caller has no concept of fragmentation. */ - if (hdr_off == 0) - sent += sizeof(struct rds_header); - - /* if we finished the message then send completion owns it */ - if (scat == &rm->data.op_sg[rm->data.op_count]) { - prev->s_rm = ic->i_rm; - prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; - } - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - if (ic->i_flowctl && i < credit_alloc) - rds_iw_send_add_credits(conn, credit_alloc - i); - - /* XXX need to worry about failed_wr and partial sends. */ - failed_wr = &first->s_send_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_send_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_send_wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; - } - goto out; - } - - ret = sent; -out: - BUG_ON(adv_credits); - return ret; -} - -static int rds_iw_build_send_reg(struct rds_iw_send_work *send, - struct scatterlist *sg, - int sg_nents) -{ - int n; - - n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE); - if (unlikely(n != sg_nents)) - return n < 0 ? n : -EINVAL; - - send->s_reg_wr.wr.opcode = IB_WR_REG_MR; - send->s_reg_wr.wr.wr_id = 0; - send->s_reg_wr.wr.num_sge = 0; - send->s_reg_wr.mr = send->s_mr; - send->s_reg_wr.key = send->s_mr->rkey; - send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE; - - ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); - - return 0; -} - -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct rds_iw_device *rds_iwdev; - struct scatterlist *scat; - unsigned long len; - u64 remote_addr = op->op_remote_addr; - u32 pos, fr_pos; - u32 work_alloc; - u32 i; - u32 j; - int sent; - int ret; - int num_sge; - int sg_nents; - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - - /* map the message the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; - } - - op->op_mapped = 1; - } - - if (!op->op_write) { - /* Alloc space on the send queue for the fastreg */ - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); - if (work_alloc != 1) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - } - - /* - * Instead of knowing how to return a partial rdma read/write we insist that there - * be enough work requests to send the entire message. - */ - i = ceil(op->op_count, rds_iwdev->max_sge); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc != i) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - send = &ic->i_sends[pos]; - if (!op->op_write) { - first = prev = &ic->i_sends[fr_pos]; - } else { - first = send; - prev = NULL; - } - scat = &op->op_sg[0]; - sent = 0; - num_sge = op->op_count; - sg_nents = 0; - - for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { - send->s_rdma_wr.wr.send_flags = 0; - send->s_queued = jiffies; - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - } - - /* To avoid the need to have the plumbing to invalidate the fastreg_mr used - * for local access after RDS is finished with it, using - * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. - */ - if (op->op_write) - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; - else - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - - send->s_rdma_wr.remote_addr = remote_addr; - send->s_rdma_wr.rkey = op->op_rkey; - send->s_op = op; - - if (num_sge > rds_iwdev->max_sge) { - send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge; - num_sge -= rds_iwdev->max_sge; - } else - send->s_rdma_wr.wr.num_sge = num_sge; - - send->s_rdma_wr.wr.next = NULL; - - if (prev) - prev->s_send_wr.next = &send->s_rdma_wr.wr; - - for (j = 0; j < send->s_rdma_wr.wr.num_sge && - scat != &op->op_sg[op->op_count]; j++) { - len = ib_sg_dma_len(ic->i_cm_id->device, scat); - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) - sg_nents++; - else { - send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); - send->s_sge[j].length = len; - send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); - } - - sent += len; - rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); - remote_addr += len; - - scat++; - } - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) { - send->s_rdma_wr.wr.num_sge = 1; - send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; - send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; - send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; - } - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_rdma_wr, - send->s_rdma_wr.wr.num_sge, - send->s_rdma_wr.wr.next); - - prev = send; - if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) - send = ic->i_sends; - } - - /* if we finished the message then send completion owns it */ - if (scat == &op->op_sg[op->op_count]) - first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - - /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not - * recommended. Putting the lkey on the wire is a security hole, as it can - * allow for memory access to all of memory on the remote system. Some - * adapters do not allow using the lkey for this at all. To bypass this use a - * fastreg_mr (or possibly a dma_mr) - */ - if (!op->op_write) { - ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos], - &op->op_sg[0], sg_nents); - if (ret) { - printk(KERN_WARNING "RDS/IW: failed to reg send mem\n"); - goto out; - } - work_alloc++; - } - - failed_wr = &first->s_rdma_wr.wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_rdma_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_rdma_wr.wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - goto out; - } - -out: - return ret; -} - -void rds_iw_xmit_complete(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* We may have a pending ACK or window update we were unable - * to send previously (due to flow control). Try again. */ - rds_iw_attempt_ack(ic); -} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c deleted file mode 100644 index 5fe67f6a1d80..000000000000 --- a/net/rds/iw_stats.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/percpu.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> - -#include "rds.h" -#include "iw.h" - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); - -static const char *const rds_iw_stat_names[] = { - "iw_connect_raced", - "iw_listen_closed_stale", - "iw_tx_cq_call", - "iw_tx_cq_event", - "iw_tx_ring_full", - "iw_tx_throttle", - "iw_tx_sg_mapping_failure", - "iw_tx_stalled", - "iw_tx_credit_updates", - "iw_rx_cq_call", - "iw_rx_cq_event", - "iw_rx_ring_empty", - "iw_rx_refill_from_cq", - "iw_rx_refill_from_thread", - "iw_rx_alloc_limit", - "iw_rx_credit_updates", - "iw_ack_sent", - "iw_ack_send_failure", - "iw_ack_send_delayed", - "iw_ack_send_piggybacked", - "iw_ack_received", - "iw_rdma_mr_alloc", - "iw_rdma_mr_free", - "iw_rdma_mr_used", - "iw_rdma_mr_pool_flush", - "iw_rdma_mr_pool_wait", - "iw_rdma_mr_pool_depleted", -}; - -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail) -{ - struct rds_iw_statistics stats = {0, }; - uint64_t *src; - uint64_t *sum; - size_t i; - int cpu; - - if (avail < ARRAY_SIZE(rds_iw_stat_names)) - goto out; - - for_each_online_cpu(cpu) { - src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); - sum = (uint64_t *)&stats; - for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) - *(sum++) += *(src++); - } - - rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, - ARRAY_SIZE(rds_iw_stat_names)); -out: - return ARRAY_SIZE(rds_iw_stat_names); -} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c deleted file mode 100644 index 139239d2cb22..000000000000 --- a/net/rds/iw_sysctl.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> - -#include "iw.h" - -static struct ctl_table_header *rds_iw_sysctl_hdr; - -unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; -unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; -unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; -static unsigned long rds_iw_sysctl_max_wr_min = 1; -/* hardware will fail CQ creation long before this */ -static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; - -unsigned long rds_iw_sysctl_max_unsig_wrs = 16; -static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; - -unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; - -unsigned int rds_iw_sysctl_flow_control = 1; - -static struct ctl_table rds_iw_sysctl_table[] = { - { - .procname = "max_send_wr", - .data = &rds_iw_sysctl_max_send_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_recv_wr", - .data = &rds_iw_sysctl_max_recv_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_unsignaled_wr", - .data = &rds_iw_sysctl_max_unsig_wrs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_wr_min, - .extra2 = &rds_iw_sysctl_max_unsig_wr_max, - }, - { - .procname = "max_unsignaled_bytes", - .data = &rds_iw_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, - .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, - }, - { - .procname = "max_recv_allocation", - .data = &rds_iw_sysctl_max_recv_allocation, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "flow_control", - .data = &rds_iw_sysctl_flow_control, - .maxlen = sizeof(rds_iw_sysctl_flow_control), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -void rds_iw_sysctl_exit(void) -{ - unregister_net_sysctl_table(rds_iw_sysctl_hdr); -} - -int rds_iw_sysctl_init(void) -{ - rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); - if (!rds_iw_sysctl_hdr) - return -ENOMEM; - return 0; -} diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9c1fed81bf0f..7220bebcf558 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, event->event, rdma_event_msg(event->event)); - if (cm_id->device->node_type == RDMA_NODE_RNIC) - trans = &rds_iw_transport; - else + if (cm_id->device->node_type == RDMA_NODE_IB_CA) trans = &rds_ib_transport; /* Prevent shutdown from tearing down the connection @@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rds_conn_drop(conn); break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + if (conn) { + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + } + break; + default: /* things like device disconnect? */ printk(KERN_ERR "RDS: unknown event %u (%s)!\n", @@ -200,10 +206,6 @@ static int rds_rdma_init(void) if (ret) goto out; - ret = rds_iw_init(); - if (ret) - goto err_iw_init; - ret = rds_ib_init(); if (ret) goto err_ib_init; @@ -211,8 +213,6 @@ static int rds_rdma_init(void) goto out; err_ib_init: - rds_iw_exit(); -err_iw_init: rds_rdma_listen_stop(); out: return ret; @@ -224,11 +224,10 @@ static void rds_rdma_exit(void) /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); rds_ib_exit(); - rds_iw_exit(); } module_exit(rds_rdma_exit); MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); -MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_DESCRIPTION("RDS: IB transport"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index faba4e382695..ff2010e9d20c 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); void rds_ib_exit(void); -/* from iw.c */ -extern struct rds_transport rds_iw_transport; -int rds_iw_init(void); -void rds_iw_exit(void); - #endif diff --git a/net/rds/rds.h b/net/rds/rds.h index 0e2797bdc316..80256b08eac0 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -222,6 +222,7 @@ struct rds_incoming { __be32 i_saddr; rds_rdma_cookie_t i_rdma_cookie; + struct timeval i_rx_tstamp; }; struct rds_mr { diff --git a/net/rds/recv.c b/net/rds/recv.c index a00462b0d01d..c0be1ecd11c9 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -35,6 +35,8 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/time.h> +#include <linux/rds.h> #include "rds.h" @@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_conn = conn; inc->i_saddr = saddr; inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + if (sock_flag(sk, SOCK_RCVTSTAMP)) + do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); @@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) /* * Receive any control messages. */ -static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, + struct rds_sock *rs) { int ret = 0; @@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) return ret; } + if ((inc->i_rx_tstamp.tv_sec != 0) && + sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { + ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(struct timeval), + &inc->i_rx_tstamp); + if (ret) + return ret; + } + return 0; } @@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, msg->msg_flags |= MSG_TRUNC; } - if (rds_cmsg_recv(inc, msg)) { + if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; goto out; } diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index 598d374f6a35..868f1ad0415a 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -41,5 +41,4 @@ config RFKILL_GPIO default n help If you say yes here you get support of a generic gpio RFKILL - driver. The platform should fill in the appropriate fields in the - rfkill_gpio_platform_data structure and pass that to the driver. + driver. diff --git a/net/rfkill/core.c b/net/rfkill/core.c index cf5b69ab1829..03f26e3a6f48 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -57,6 +57,8 @@ struct rfkill { bool registered; bool persistent; + bool polling_paused; + bool suspended; const struct rfkill_ops *ops; void *data; @@ -233,29 +235,6 @@ static void rfkill_event(struct rfkill *rfkill) rfkill_send_events(rfkill, RFKILL_OP_CHANGE); } -static bool __rfkill_set_hw_state(struct rfkill *rfkill, - bool blocked, bool *change) -{ - unsigned long flags; - bool prev, any; - - BUG_ON(!rfkill); - - spin_lock_irqsave(&rfkill->lock, flags); - prev = !!(rfkill->state & RFKILL_BLOCK_HW); - if (blocked) - rfkill->state |= RFKILL_BLOCK_HW; - else - rfkill->state &= ~RFKILL_BLOCK_HW; - *change = prev != blocked; - any = !!(rfkill->state & RFKILL_BLOCK_ANY); - spin_unlock_irqrestore(&rfkill->lock, flags); - - rfkill_led_trigger_event(rfkill); - - return any; -} - /** * rfkill_set_block - wrapper for set_block method * @@ -285,7 +264,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_lock_irqsave(&rfkill->lock, flags); prev = rfkill->state & RFKILL_BLOCK_SW; - if (rfkill->state & RFKILL_BLOCK_SW) + if (prev) rfkill->state |= RFKILL_BLOCK_SW_PREV; else rfkill->state &= ~RFKILL_BLOCK_SW_PREV; @@ -303,8 +282,8 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_lock_irqsave(&rfkill->lock, flags); if (err) { /* - * Failed -- reset status to _prev, this may be different - * from what set set _PREV to earlier in this function + * Failed -- reset status to _PREV, which may be different + * from what we have set _PREV to earlier in this function * if rfkill_set_sw_state was invoked. */ if (rfkill->state & RFKILL_BLOCK_SW_PREV) @@ -323,6 +302,19 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) rfkill_event(rfkill); } +static void rfkill_update_global_state(enum rfkill_type type, bool blocked) +{ + int i; + + if (type != RFKILL_TYPE_ALL) { + rfkill_global_states[type].cur = blocked; + return; + } + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; +} + #ifdef CONFIG_RFKILL_INPUT static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); @@ -332,8 +324,7 @@ static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); * @blocked: the new state * * This function sets the state of all switches of given type, - * unless a specific switch is claimed by userspace (in which case, - * that switch is left alone) or suspended. + * unless a specific switch is suspended. * * Caller must have acquired rfkill_global_mutex. */ @@ -341,15 +332,7 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - if (type == RFKILL_TYPE_ALL) { - int i; - - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = blocked; - } else { - rfkill_global_states[type].cur = blocked; - } - + rfkill_update_global_state(type, blocked); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; @@ -477,17 +460,28 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) } #endif - bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) { - bool ret, change; + unsigned long flags; + bool ret, prev; + + BUG_ON(!rfkill); - ret = __rfkill_set_hw_state(rfkill, blocked, &change); + spin_lock_irqsave(&rfkill->lock, flags); + prev = !!(rfkill->state & RFKILL_BLOCK_HW); + if (blocked) + rfkill->state |= RFKILL_BLOCK_HW; + else + rfkill->state &= ~RFKILL_BLOCK_HW; + ret = !!(rfkill->state & RFKILL_BLOCK_ANY); + spin_unlock_irqrestore(&rfkill->lock, flags); + + rfkill_led_trigger_event(rfkill); if (!rfkill->registered) return ret; - if (change) + if (prev != blocked) schedule_work(&rfkill->uevent_work); return ret; @@ -582,6 +576,34 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) } EXPORT_SYMBOL(rfkill_set_states); +static const char * const rfkill_types[] = { + NULL, /* RFKILL_TYPE_ALL */ + "wlan", + "bluetooth", + "ultrawideband", + "wimax", + "wwan", + "gps", + "fm", + "nfc", +}; + +enum rfkill_type rfkill_find_type(const char *name) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES); + + if (!name) + return RFKILL_TYPE_ALL; + + for (i = 1; i < NUM_RFKILL_TYPES; i++) + if (!strcmp(name, rfkill_types[i])) + return i; + return RFKILL_TYPE_ALL; +} +EXPORT_SYMBOL(rfkill_find_type); + static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -591,38 +613,12 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(name); -static const char *rfkill_get_type_str(enum rfkill_type type) -{ - BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1); - - switch (type) { - case RFKILL_TYPE_WLAN: - return "wlan"; - case RFKILL_TYPE_BLUETOOTH: - return "bluetooth"; - case RFKILL_TYPE_UWB: - return "ultrawideband"; - case RFKILL_TYPE_WIMAX: - return "wimax"; - case RFKILL_TYPE_WWAN: - return "wwan"; - case RFKILL_TYPE_GPS: - return "gps"; - case RFKILL_TYPE_FM: - return "fm"; - case RFKILL_TYPE_NFC: - return "nfc"; - default: - BUG(); - } -} - static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); + return sprintf(buf, "%s\n", rfkill_types[rfkill->type]); } static DEVICE_ATTR_RO(type); @@ -730,20 +726,12 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(state); -static ssize_t claim_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", 0); -} -static DEVICE_ATTR_RO(claim); - static struct attribute *rfkill_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_type.attr, &dev_attr_index.attr, &dev_attr_persistent.attr, &dev_attr_state.attr, - &dev_attr_claim.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, NULL, @@ -768,7 +756,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) if (error) return error; error = add_uevent_var(env, "RFKILL_TYPE=%s", - rfkill_get_type_str(rfkill->type)); + rfkill_types[rfkill->type]); if (error) return error; spin_lock_irqsave(&rfkill->lock, flags); @@ -786,6 +774,7 @@ void rfkill_pause_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; + rfkill->polling_paused = true; cancel_delayed_work_sync(&rfkill->poll_work); } EXPORT_SYMBOL(rfkill_pause_polling); @@ -797,6 +786,11 @@ void rfkill_resume_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; + rfkill->polling_paused = false; + + if (rfkill->suspended) + return; + queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); } @@ -807,7 +801,8 @@ static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); - rfkill_pause_polling(rfkill); + rfkill->suspended = true; + cancel_delayed_work_sync(&rfkill->poll_work); return 0; } @@ -817,12 +812,16 @@ static int rfkill_resume(struct device *dev) struct rfkill *rfkill = to_rfkill(dev); bool cur; + rfkill->suspended = false; + if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); } - rfkill_resume_polling(rfkill); + if (rfkill->ops->poll && !rfkill->polling_paused) + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, 0); return 0; } @@ -1164,15 +1163,8 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, mutex_lock(&rfkill_global_mutex); - if (ev.op == RFKILL_OP_CHANGE_ALL) { - if (ev.type == RFKILL_TYPE_ALL) { - enum rfkill_type i; - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = ev.soft; - } else { - rfkill_global_states[ev.type].cur = ev.soft; - } - } + if (ev.op == RFKILL_OP_CHANGE_ALL) + rfkill_update_global_state(ev.type, ev.soft); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL) @@ -1261,10 +1253,8 @@ static struct miscdevice rfkill_miscdev = { static int __init rfkill_init(void) { int error; - int i; - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = !rfkill_default_state; + rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state); error = class_register(&rfkill_class); if (error) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 4b1e3f35f06c..76c01cbd56e3 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -27,8 +27,6 @@ #include <linux/acpi.h> #include <linux/gpio/consumer.h> -#include <linux/rfkill-gpio.h> - struct rfkill_gpio_data { const char *name; enum rfkill_type type; @@ -81,7 +79,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev, if (!id) return -ENODEV; - rfkill->name = dev_name(dev); rfkill->type = (unsigned)id->driver_data; return acpi_dev_add_driver_gpios(ACPI_COMPANION(dev), @@ -90,24 +87,27 @@ static int rfkill_gpio_acpi_probe(struct device *dev, static int rfkill_gpio_probe(struct platform_device *pdev) { - struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; struct rfkill_gpio_data *rfkill; struct gpio_desc *gpio; + const char *type_name; int ret; rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL); if (!rfkill) return -ENOMEM; + device_property_read_string(&pdev->dev, "name", &rfkill->name); + device_property_read_string(&pdev->dev, "type", &type_name); + + if (!rfkill->name) + rfkill->name = dev_name(&pdev->dev); + + rfkill->type = rfkill_find_type(type_name); + if (ACPI_HANDLE(&pdev->dev)) { ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill); if (ret) return ret; - } else if (pdata) { - rfkill->name = pdata->name; - rfkill->type = pdata->type; - } else { - return -ENODEV; } rfkill->clk = devm_clk_get(&pdev->dev, NULL); @@ -124,10 +124,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->shutdown_gpio = gpio; - /* Make sure at-least one of the GPIO is defined and that - * a name is specified for this instance - */ - if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) { + /* Make sure at-least one GPIO is defined for this instance */ + if (!rfkill->reset_gpio && !rfkill->shutdown_gpio) { dev_err(&pdev->dev, "invalid platform data\n"); return -EINVAL; } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7e2d1057d8bc..9d935fa5a2a9 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -37,7 +37,7 @@ static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; /* local epoch for detecting local-end reset */ -__be32 rxrpc_epoch; +u32 rxrpc_epoch; /* current debugging ID */ atomic_t rxrpc_debug_id; @@ -81,6 +81,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, int len) { + unsigned int tail; + if (len < sizeof(struct sockaddr_rxrpc)) return -EINVAL; @@ -103,9 +105,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, _debug("INET: %x @ %pI4", ntohs(srx->transport.sin.sin_port), &srx->transport.sin.sin_addr); - if (srx->transport_len > 8) - memset((void *)&srx->transport + 8, 0, - srx->transport_len - 8); + tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; case AF_INET6: @@ -113,6 +113,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, return -EAFNOSUPPORT; } + if (tail < len) + memset((void *)srx + tail, 0, len - tail); return 0; } @@ -121,11 +123,10 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, */ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) { - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr; + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct sock *sk = sock->sk; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; - __be16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -143,7 +144,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) memcpy(&rx->srx, srx, sizeof(rx->srx)); - /* find a local transport endpoint if we don't have one already */ + /* Find or create a local transport endpoint to use */ local = rxrpc_lookup_local(&rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); @@ -152,14 +153,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) rx->local = local; if (srx->srx_service) { - service_id = htons(srx->srx_service); write_lock_bh(&local->services_lock); list_for_each_entry(prx, &local->services, listen_link) { - if (prx->service_id == service_id) + if (prx->srx.srx_service == srx->srx_service) goto service_in_use; } - rx->service_id = service_id; list_add_tail(&rx->listen_link, &local->services); write_unlock_bh(&local->services_lock); @@ -276,7 +275,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_transport *trans; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - __be16 service_id; _enter(",,%x,%lx", key_serial(key), user_call_ID); @@ -299,16 +297,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, atomic_inc(&trans->usage); } - service_id = rx->service_id; - if (srx) - service_id = htons(srx->srx_service); - + if (!srx) + srx = &rx->srx; if (!key) key = rx->key; if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ - bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp); + bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp); if (IS_ERR(bundle)) { call = ERR_CAST(bundle); goto out; @@ -324,7 +320,6 @@ out_notrans: _leave(" = %p", call); return call; } - EXPORT_SYMBOL(rxrpc_kernel_begin_call); /** @@ -340,7 +335,6 @@ void rxrpc_kernel_end_call(struct rxrpc_call *call) rxrpc_remove_user_ID(call->socket, call); rxrpc_put_call(call); } - EXPORT_SYMBOL(rxrpc_kernel_end_call); /** @@ -425,7 +419,6 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, } rx->trans = trans; - rx->service_id = htons(srx->srx_service); rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; release_sock(&rx->sk); @@ -622,7 +615,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; - /* we support transport protocol UDP only */ + /* we support transport protocol UDP/UDP6 only */ if (protocol != PF_INET) return -EPROTONOSUPPORT; @@ -754,7 +747,7 @@ static int rxrpc_release(struct socket *sock) * RxRPC network protocol */ static const struct proto_ops rxrpc_rpc_ops = { - .family = PF_UNIX, + .family = PF_RXRPC, .owner = THIS_MODULE, .release = rxrpc_release, .bind = rxrpc_bind, @@ -778,7 +771,7 @@ static struct proto rxrpc_proto = { .name = "RXRPC", .owner = THIS_MODULE, .obj_size = sizeof(struct rxrpc_sock), - .max_header = sizeof(struct rxrpc_header), + .max_header = sizeof(struct rxrpc_wire_header), }; static const struct net_proto_family rxrpc_family_ops = { @@ -796,7 +789,7 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); - rxrpc_epoch = htonl(get_seconds()); + rxrpc_epoch = get_seconds(); ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c index 6d79310fcaae..277731a5e67a 100644 --- a/net/rxrpc/ar-accept.c +++ b/net/rxrpc/ar-accept.c @@ -27,7 +27,7 @@ * generate a connection-level abort */ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_header *hdr) + struct rxrpc_wire_header *whdr) { struct msghdr msg; struct kvec iov[1]; @@ -36,25 +36,21 @@ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, _enter("%d,,", local->debug_id); + whdr->type = RXRPC_PACKET_TYPE_BUSY; + whdr->serial = htonl(1); + msg.msg_name = &srx->transport.sin; msg.msg_namelen = sizeof(srx->transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->seq = 0; - hdr->type = RXRPC_PACKET_TYPE_BUSY; - hdr->flags = 0; - hdr->userStatus = 0; - hdr->_rsvd = 0; - - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + iov[0].iov_base = whdr; + iov[0].iov_len = sizeof(*whdr); len = iov[0].iov_len; - hdr->serial = htonl(1); - _proto("Tx BUSY %%%u", ntohl(hdr->serial)); + _proto("Tx BUSY %%1"); ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); if (ret < 0) { @@ -185,8 +181,8 @@ invalid_service: read_unlock_bh(&local->services_lock); read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) { + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { rxrpc_get_call(call); rxrpc_queue_call(call); } @@ -211,8 +207,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work) struct rxrpc_skb_priv *sp; struct sockaddr_rxrpc srx; struct rxrpc_sock *rx; + struct rxrpc_wire_header whdr; struct sk_buff *skb; - __be16 service_id; int ret; _enter("%d", local->debug_id); @@ -240,6 +236,19 @@ process_next_packet: sp = rxrpc_skb(skb); + /* Set up a response packet header in case we need it */ + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = 0; + whdr.flags = 0; + whdr.type = 0; + whdr.userStatus = 0; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + /* determine the remote address */ memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -256,10 +265,9 @@ process_next_packet: } /* get the socket providing the service */ - service_id = sp->hdr.serviceId; read_lock_bh(&local->services_lock); list_for_each_entry(rx, &local->services, listen_link) { - if (rx->service_id == service_id && + if (rx->srx.srx_service == sp->hdr.serviceId && rx->sk.sk_state != RXRPC_CLOSE) goto found_service; } @@ -267,7 +275,7 @@ process_next_packet: goto invalid_service; found_service: - _debug("found service %hd", ntohs(rx->service_id)); + _debug("found service %hd", rx->srx.srx_service); if (sk_acceptq_is_full(&rx->sk)) goto backlog_full; sk_acceptq_added(&rx->sk); @@ -296,7 +304,7 @@ found_service: backlog_full: read_unlock_bh(&local->services_lock); busy: - rxrpc_busy(local, &srx, &sp->hdr); + rxrpc_busy(local, &srx, &whdr); rxrpc_free_skb(skb); goto process_next_packet; @@ -379,7 +387,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events)) + if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) BUG(); rxrpc_queue_call(call); @@ -395,7 +403,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, out_release: _debug("release %p", call); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); out_discard: write_unlock_bh(&call->state_lock); @@ -407,7 +415,7 @@ out: } /* - * handle rejectance of a call by userspace + * Handle rejection of a call by userspace * - reject the call at the front of the queue */ int rxrpc_reject_call(struct rxrpc_sock *rx) @@ -434,7 +442,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: call->state = RXRPC_CALL_SERVER_BUSY; - if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) + if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) rxrpc_queue_call(call); ret = 0; goto out_release; @@ -458,7 +466,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) out_release: _debug("release %p", call); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); out_discard: write_unlock_bh(&call->state_lock); @@ -487,7 +495,6 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, _leave(" = %p", call); return call; } - EXPORT_SYMBOL(rxrpc_kernel_accept_call); /** @@ -506,5 +513,4 @@ int rxrpc_kernel_reject_call(struct socket *sock) _leave(" = %d", ret); return ret; } - EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index adc555e0323d..16d967075eaf 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -23,7 +23,7 @@ * How long to wait before scheduling ACK generation after seeing a * packet with RXRPC_REQUEST_ACK set (in jiffies). */ -unsigned rxrpc_requested_ack_delay = 1; +unsigned int rxrpc_requested_ack_delay = 1; /* * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). @@ -32,7 +32,7 @@ unsigned rxrpc_requested_ack_delay = 1; * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned rxrpc_soft_ack_delay = 1 * HZ; +unsigned int rxrpc_soft_ack_delay = 1 * HZ; /* * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). @@ -41,7 +41,7 @@ unsigned rxrpc_soft_ack_delay = 1 * HZ; * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned rxrpc_idle_ack_delay = 0.5 * HZ; +unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; /* * Receive window size in packets. This indicates the maximum number of @@ -49,19 +49,19 @@ unsigned rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned rxrpc_rx_window_size = 32; +unsigned int rxrpc_rx_window_size = 32; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = 5692; /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 4; static const char *rxrpc_acks(u8 reason) { @@ -91,7 +91,7 @@ static const s8 rxrpc_ack_priority[] = { * propose an ACK be sent */ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - __be32 serial, bool immediate) + u32 serial, bool immediate) { unsigned long expiry; s8 prior = rxrpc_ack_priority[ack_reason]; @@ -99,8 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, ASSERTCMP(prior, >, 0); _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), ntohl(serial), - immediate); + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); if (prior < rxrpc_ack_priority[call->ackr_reason]) { if (immediate) @@ -139,7 +138,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, expiry = rxrpc_requested_ack_delay; if (!expiry) goto cancel_timer; - if (!immediate || serial == cpu_to_be32(1)) { + if (!immediate || serial == 1) { _debug("run defer timer"); goto run_timer; } @@ -157,11 +156,11 @@ run_timer: return; cancel_timer: - _debug("cancel timer %%%u", ntohl(serial)); + _debug("cancel timer %%%u", serial); try_to_del_timer_sync(&call->ack_timer); read_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -170,7 +169,7 @@ cancel_timer: * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - __be32 serial, bool immediate) + u32 serial, bool immediate) { s8 prior = rxrpc_ack_priority[ack_reason]; @@ -193,7 +192,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, if (resend & 1) { _debug("SET RESEND"); - set_bit(RXRPC_CALL_RESEND, &call->events); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); } if (resend & 2) { @@ -203,7 +202,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, } else { _debug("KILL RESEND TIMER"); del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); } read_unlock_bh(&call->state_lock); @@ -214,8 +213,8 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, */ static void rxrpc_resend(struct rxrpc_call *call) { + struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; - struct rxrpc_header *hdr; struct sk_buff *txb; unsigned long *p_txb, resend_at; bool stop; @@ -247,14 +246,13 @@ static void rxrpc_resend(struct rxrpc_call *call) sp->need_resend = false; /* each Tx packet has a new serial number */ - sp->hdr.serial = - htonl(atomic_inc_return(&call->conn->serial)); + sp->hdr.serial = atomic_inc_return(&call->conn->serial); - hdr = (struct rxrpc_header *) txb->head; - hdr->serial = sp->hdr.serial; + whdr = (struct rxrpc_wire_header *)txb->head; + whdr->serial = htonl(sp->hdr.serial); _proto("Tx DATA %%%u { #%d }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + sp->hdr.serial, sp->hdr.seq); if (rxrpc_send_packet(call->conn->trans, txb) < 0) { stop = true; sp->resend_at = jiffies + 3; @@ -428,7 +426,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) int tail = call->acks_tail, old_tail; int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - _enter("{%u,%u},%u", call->acks_hard, win, hard); + kenter("{%u,%u},%u", call->acks_hard, win, hard); ASSERTCMP(hard - call->acks_hard, <=, win); @@ -478,11 +476,11 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) sp = rxrpc_skb(skb); _debug("drain OOS packet %d [%d]", - ntohl(sp->hdr.seq), call->rx_first_oos); + sp->hdr.seq, call->rx_first_oos); - if (ntohl(sp->hdr.seq) != call->rx_first_oos) { + if (sp->hdr.seq != call->rx_first_oos) { skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; _debug("requeue %p {%u}", skb, call->rx_first_oos); } else { skb->mark = RXRPC_SKB_MARK_DATA; @@ -496,8 +494,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) /* find out what the next packet is */ skb = skb_peek(&call->rx_oos_queue); if (skb) - call->rx_first_oos = - ntohl(rxrpc_skb(skb)->hdr.seq); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; else call->rx_first_oos = 0; _debug("peek %p {%u}", skb, call->rx_first_oos); @@ -522,7 +519,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, u32 seq; sp = rxrpc_skb(skb); - seq = ntohl(sp->hdr.seq); + seq = sp->hdr.seq; _enter(",,{%u}", seq); skb->destructor = rxrpc_packet_destructor; @@ -535,9 +532,8 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, skb_queue_walk(&call->rx_oos_queue, p) { psp = rxrpc_skb(p); - if (ntohl(psp->hdr.seq) > seq) { - _debug("insert oos #%u before #%u", - seq, ntohl(psp->hdr.seq)); + if (psp->hdr.seq > seq) { + _debug("insert oos #%u before #%u", seq, psp->hdr.seq); skb_insert(p, skb, &call->rx_oos_queue); goto inserted; } @@ -555,7 +551,7 @@ inserted: if (call->state < RXRPC_CALL_COMPLETE && call->rx_data_post == call->rx_first_oos) { _debug("drain rx oos now"); - set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); } read_unlock(&call->state_lock); @@ -586,7 +582,7 @@ static void rxrpc_zap_tx_window(struct rxrpc_call *call) skb = (struct sk_buff *) _skb; sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + _debug("+++ clear Tx %u", sp->hdr.seq); rxrpc_free_skb(skb); } @@ -657,8 +653,7 @@ process_further: /* data packets that wind up here have been received out of * order, need security processing or are jumbo packets */ case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); /* secured packets must be verified and possibly decrypted */ if (rxrpc_verify_packet(call, skb, _abort_code) < 0) @@ -676,7 +671,7 @@ process_further: if (!skb_pull(skb, sizeof(ack))) BUG(); - latest = ntohl(sp->hdr.serial); + latest = sp->hdr.serial; hard = ntohl(ack.firstPacket); tx = atomic_read(&call->sequence); @@ -793,7 +788,7 @@ all_acked: del_timer_sync(&call->resend_timer); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); if (call->acks_window) rxrpc_zap_tx_window(call); @@ -881,16 +876,17 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); + struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; struct rxrpc_ackinfo ackinfo; - struct rxrpc_header hdr; struct msghdr msg; struct kvec iov[5]; + enum rxrpc_call_event genbit; unsigned long bits; __be32 data, pad; size_t len; - int genbit, loop, nbit, ioc, ret, mtu; - u32 abort_code = RX_PROTOCOL_ERROR; + int loop, nbit, ioc, ret, mtu; + u32 serial, abort_code = RX_PROTOCOL_ERROR; u8 *acks = NULL; //printk("\n--------------------\n"); @@ -911,33 +907,33 @@ void rxrpc_process_call(struct work_struct *work) msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = call->conn->epoch; - hdr.cid = call->cid; - hdr.callNumber = call->call_id; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_ACK; - hdr.flags = call->conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = call->conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = call->conn->service_id; + whdr.epoch = htonl(call->conn->epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ACK; + whdr.flags = call->conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = call->conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(call->service_id); memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_RELEASE, &call->events)) { + if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { rxrpc_release_call(call); - clear_bit(RXRPC_CALL_RELEASE, &call->events); + clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); } - if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) { + if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { int error; - clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); error = call->conn->trans->peer->net_error; _debug("post net error %d", error); @@ -945,47 +941,47 @@ void rxrpc_process_call(struct work_struct *work) if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, error, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); goto kill_ACKs; } - if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); _debug("post conn abort"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, call->conn->error, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); goto kill_ACKs; } - if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) { - hdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_REJECT_BUSY; + if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { + whdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_EV_REJECT_BUSY; goto send_message; } - if (test_bit(RXRPC_CALL_ABORT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, ECONNABORTED, true) < 0) goto no_mem; - hdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.type = RXRPC_PACKET_TYPE_ABORT; data = htonl(call->abort_code); iov[1].iov_base = &data; iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_ABORT; + genbit = RXRPC_CALL_EV_ABORT; goto send_message; } - if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_ACK_FINAL; + if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_EV_ACK_FINAL; ack.bufferSpace = htons(8); ack.maxSkew = 0; @@ -995,9 +991,9 @@ void rxrpc_process_call(struct work_struct *work) call->ackr_reason = 0; spin_lock_bh(&call->lock); - ack.serial = call->ackr_serial; - ack.previousPacket = call->ackr_prev_seq; - ack.firstPacket = htonl(call->rx_data_eaten + 1); + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); spin_unlock_bh(&call->lock); pad = 0; @@ -1011,12 +1007,12 @@ void rxrpc_process_call(struct work_struct *work) goto send_ACK; } - if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) | - (1 << RXRPC_CALL_RCVD_ABORT)) + if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | + (1 << RXRPC_CALL_EV_RCVD_ABORT)) ) { u32 mark; - if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events)) + if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) mark = RXRPC_SKB_MARK_REMOTE_ABORT; else mark = RXRPC_SKB_MARK_BUSY; @@ -1026,22 +1022,22 @@ void rxrpc_process_call(struct work_struct *work) if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); goto kill_ACKs; } - if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { _debug("do implicit ackall"); rxrpc_clear_tx_window(call); } - if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) { + if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { write_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_TIMEOUT; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } write_unlock_bh(&call->state_lock); @@ -1050,7 +1046,7 @@ void rxrpc_process_call(struct work_struct *work) ETIME, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); goto kill_ACKs; } @@ -1071,13 +1067,13 @@ void rxrpc_process_call(struct work_struct *work) } /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_resend(call); /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_ACK, &call->events)) { + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { _debug("send ACK: window: %d - %d { %lx }", call->rx_data_eaten, call->ackr_win_top, call->ackr_window[0]); @@ -1085,11 +1081,11 @@ void rxrpc_process_call(struct work_struct *work) if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); goto maybe_reschedule; } - genbit = RXRPC_CALL_ACK; + genbit = RXRPC_CALL_EV_ACK; acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, GFP_NOFS); @@ -1099,13 +1095,11 @@ void rxrpc_process_call(struct work_struct *work) //hdr.flags = RXRPC_SLOW_START_OK; ack.bufferSpace = htons(8); ack.maxSkew = 0; - ack.serial = 0; - ack.reason = 0; spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = call->ackr_serial; - ack.previousPacket = call->ackr_prev_seq; + ack.reason = call->ackr_reason; + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); ack.firstPacket = htonl(call->rx_data_eaten + 1); ack.nAcks = 0; @@ -1152,7 +1146,7 @@ void rxrpc_process_call(struct work_struct *work) /* handle completion of security negotiations on an incoming * connection */ - if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { _debug("secured"); spin_lock_bh(&call->lock); @@ -1160,7 +1154,7 @@ void rxrpc_process_call(struct work_struct *work) _debug("securing"); write_lock(&call->conn->lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_bit(RXRPC_CALL_RELEASE, &call->events)) { + !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { _debug("not released"); call->state = RXRPC_CALL_SERVER_ACCEPTING; list_move_tail(&call->accept_link, @@ -1169,39 +1163,39 @@ void rxrpc_process_call(struct work_struct *work) write_unlock(&call->conn->lock); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); read_unlock(&call->state_lock); } spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) + if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) goto maybe_reschedule; } /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { _debug("post accept"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, 0, false) < 0) goto no_mem; - clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); goto maybe_reschedule; } /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { _debug("accepted"); ASSERTCMP(call->rx_data_post, ==, 0); call->rx_data_post = 1; read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); read_unlock_bh(&call->state_lock); } /* drain the out of sequence received packet queue into the packet Rx * queue */ - if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { while (call->rx_data_post == call->rx_first_oos) if (rxrpc_drain_rx_oos_queue(call) < 0) break; @@ -1224,9 +1218,10 @@ send_ACK: ackinfo.rxMTU = htonl(rxrpc_rx_mtu); ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - ntohl(hdr.serial), + serial, ntohs(ack.maxSkew), ntohl(ack.firstPacket), ntohl(ack.previousPacket), @@ -1242,8 +1237,9 @@ send_ACK: send_message: _debug("send message"); - hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); - _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial)); + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); send_message_2: len = iov[0].iov_len; @@ -1280,12 +1276,12 @@ send_message_2: } switch (genbit) { - case RXRPC_CALL_ABORT: + case RXRPC_CALL_EV_ABORT: clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); goto kill_ACKs; - case RXRPC_CALL_ACK_FINAL: + case RXRPC_CALL_EV_ACK_FINAL: write_lock_bh(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) call->state = RXRPC_CALL_COMPLETE; @@ -1310,9 +1306,9 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); - if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) rxrpc_put_call(call); - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: if (call->events || !skb_queue_empty(&call->rx_queue)) { @@ -1326,12 +1322,11 @@ maybe_reschedule: if (call->state >= RXRPC_CALL_COMPLETE && !list_empty(&call->accept_link)) { _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, - ntohl(call->conn->cid)); + call, call->events, call->flags, call->conn->cid); read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -1345,7 +1340,7 @@ error: * this means there's a race between clearing the flag and setting the * work pending bit and the work item being processed again */ if (call->events && !work_pending(&call->processor)) { - _debug("jumpstart %x", ntohl(call->conn->cid)); + _debug("jumpstart %x", call->conn->cid); rxrpc_queue_call(call); } diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index a9e05db0f5d5..7c8d300ade9b 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -21,14 +21,14 @@ /* * Maximum lifetime of a call (in jiffies). */ -unsigned rxrpc_max_call_lifetime = 60 * HZ; +unsigned int rxrpc_max_call_lifetime = 60 * HZ; /* * Time till dead call expires after last use (in jiffies). */ -unsigned rxrpc_dead_call_expiry = 2 * HZ; +unsigned int rxrpc_dead_call_expiry = 2 * HZ; -const char *const rxrpc_call_states[] = { +const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", @@ -64,11 +64,11 @@ static DEFINE_HASHTABLE(rxrpc_call_hash, 10); * Hash function for rxrpc_call_hash */ static unsigned long rxrpc_call_hashfunc( - u8 clientflag, - __be32 cid, - __be32 call_id, - __be32 epoch, - __be16 service_id, + u8 in_clientflag, + u32 cid, + u32 call_id, + u32 epoch, + u16 service_id, sa_family_t proto, void *localptr, unsigned int addr_size, @@ -77,7 +77,6 @@ static unsigned long rxrpc_call_hashfunc( const u16 *p; unsigned int i; unsigned long key; - u32 hcid = ntohl(cid); _enter(""); @@ -85,12 +84,12 @@ static unsigned long rxrpc_call_hashfunc( /* We just want to add up the __be32 values, so forcing the * cast should be okay. */ - key += (__force u32)epoch; - key += (__force u16)service_id; - key += (__force u32)call_id; - key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; - key += hcid & RXRPC_CHANNELMASK; - key += clientflag; + key += epoch; + key += service_id; + key += call_id; + key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; + key += cid & RXRPC_CHANNELMASK; + key += in_clientflag; key += proto; /* Step through the peer address in 16-bit portions for speed */ for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) @@ -148,19 +147,16 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call) * isn't there. */ struct rxrpc_call *rxrpc_find_call_hash( - u8 clientflag, - __be32 cid, - __be32 call_id, - __be32 epoch, - __be16 service_id, + struct rxrpc_host_header *hdr, void *localptr, sa_family_t proto, - const u8 *peer_addr) + const void *peer_addr) { unsigned long key; unsigned int addr_size = 0; struct rxrpc_call *call = NULL; struct rxrpc_call *ret = NULL; + u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; _enter(""); switch (proto) { @@ -174,20 +170,21 @@ struct rxrpc_call *rxrpc_find_call_hash( break; } - key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch, - service_id, proto, localptr, addr_size, + key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, + hdr->epoch, hdr->serviceId, + proto, localptr, addr_size, peer_addr); hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { if (call->hash_key == key && - call->call_id == call_id && - call->cid == cid && - call->in_clientflag == clientflag && - call->service_id == service_id && + call->call_id == hdr->callNumber && + call->cid == hdr->cid && + call->in_clientflag == in_clientflag && + call->service_id == hdr->serviceId && call->proto == proto && call->local == localptr && memcmp(call->peer_ip.ipv6_addr, peer_addr, - addr_size) == 0 && - call->epoch == epoch) { + addr_size) == 0 && + call->epoch == hdr->epoch) { ret = call; break; } @@ -414,12 +411,12 @@ found_extant_second: */ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_connection *conn, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, gfp_t gfp) { struct rxrpc_call *call, *candidate; struct rb_node **p, *parent; - __be32 call_id; + u32 call_id; _enter(",%d,,%x", conn->debug_id, gfp); @@ -433,7 +430,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate->conn = conn; candidate->cid = hdr->cid; candidate->call_id = hdr->callNumber; - candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK; + candidate->channel = hdr->cid & RXRPC_CHANNELMASK; candidate->rx_data_post = 0; candidate->state = RXRPC_CALL_SERVER_ACCEPTING; if (conn->security_ix > 0) @@ -452,7 +449,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) rxrpc_queue_call(call); case RXRPC_CALL_REMOTELY_ABORTED: read_unlock(&call->state_lock); @@ -492,9 +489,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* The tree is sorted in order of the __be32 value without * turning it into host order. */ - if ((__force u32)call_id < (__force u32)call->call_id) + if (call_id < call->call_id) p = &(*p)->rb_left; - else if ((__force u32)call_id > (__force u32)call->call_id) + else if (call_id > call->call_id) p = &(*p)->rb_right; else goto old_call; @@ -686,7 +683,7 @@ void rxrpc_release_call(struct rxrpc_call *call) _debug("+++ ABORTING STATE %d +++\n", call->state); call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_DEAD; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); @@ -714,8 +711,7 @@ void rxrpc_release_call(struct rxrpc_call *call) _debug("- zap %s %%%u #%u", rxrpc_pkts[sp->hdr.type], - ntohl(sp->hdr.serial), - ntohl(sp->hdr.seq)); + sp->hdr.serial, sp->hdr.seq); rxrpc_free_skb(skb); spin_lock_bh(&call->lock); } @@ -763,10 +759,10 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call) _debug("abort call %p", call); call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_DEAD; - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) sched = true; } - if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) sched = true; if (sched) rxrpc_queue_call(call); @@ -873,9 +869,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) unsigned long _skb; _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *) _skb); - _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); - rxrpc_free_skb((struct sk_buff *) _skb); + sp = rxrpc_skb((struct sk_buff *)_skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb((struct sk_buff *)_skb); call->acks_tail = (call->acks_tail + 1) & (call->acks_winsz - 1); } @@ -975,7 +971,7 @@ static void rxrpc_call_life_expired(unsigned long _call) _enter("{%d}", call->debug_id); read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { - set_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); @@ -995,7 +991,7 @@ static void rxrpc_resend_time_expired(unsigned long _call) return; clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_queue_call(call); } @@ -1013,7 +1009,7 @@ static void rxrpc_ack_time_expired(unsigned long _call) read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 6c71ed1caf16..9942da1edbf6 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -21,7 +21,7 @@ /* * Time till a connection expires after last use (in seconds). */ -unsigned rxrpc_connection_expiry = 10 * 60; +unsigned int rxrpc_connection_expiry = 10 * 60; static void rxrpc_connection_reaper(struct work_struct *work); @@ -57,10 +57,10 @@ static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) */ static inline int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, - struct key *key, __be16 service_id) + struct key *key, u16 service_id) { return (bundle->service_id - service_id) ?: - ((unsigned long) bundle->key - (unsigned long) key); + ((unsigned long)bundle->key - (unsigned long)key); } /* @@ -69,14 +69,14 @@ int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, struct rxrpc_transport *trans, struct key *key, - __be16 service_id, + u16 service_id, gfp_t gfp) { struct rxrpc_conn_bundle *bundle, *candidate; struct rb_node *p, *parent, **pp; _enter("%p{%x},%x,%hx,", - rx, key_serial(key), trans->debug_id, ntohs(service_id)); + rx, key_serial(key), trans->debug_id, service_id); if (rx->trans == trans && rx->bundle) { atomic_inc(&rx->bundle->usage); @@ -213,7 +213,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->avail_calls = RXRPC_MAXCALLS; conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_header); + conn->header_size = sizeof(struct rxrpc_wire_header); } _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); @@ -230,7 +230,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) struct rxrpc_connection *xconn; struct rb_node *parent, **p; __be32 epoch; - u32 real_conn_id; + u32 cid; _enter(""); @@ -241,7 +241,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) conn->trans->conn_idcounter += RXRPC_CID_INC; if (conn->trans->conn_idcounter < RXRPC_CID_INC) conn->trans->conn_idcounter = RXRPC_CID_INC; - real_conn_id = conn->trans->conn_idcounter; + cid = conn->trans->conn_idcounter; attempt_insertion: parent = NULL; @@ -255,9 +255,9 @@ attempt_insertion: p = &(*p)->rb_left; else if (epoch > xconn->epoch) p = &(*p)->rb_right; - else if (real_conn_id < xconn->real_conn_id) + else if (cid < xconn->cid) p = &(*p)->rb_left; - else if (real_conn_id > xconn->real_conn_id) + else if (cid > xconn->cid) p = &(*p)->rb_right; else goto id_exists; @@ -268,20 +268,19 @@ attempt_insertion: rb_link_node(&conn->node, parent, p); rb_insert_color(&conn->node, &conn->trans->client_conns); - conn->real_conn_id = real_conn_id; - conn->cid = htonl(real_conn_id); + conn->cid = cid; write_unlock_bh(&conn->trans->conn_lock); - _leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid)); + _leave(" [CID %x]", cid); return; /* we found a connection with the proposed ID - walk the tree from that * point looking for the next unused ID */ id_exists: for (;;) { - real_conn_id += RXRPC_CID_INC; - if (real_conn_id < RXRPC_CID_INC) { - real_conn_id = RXRPC_CID_INC; - conn->trans->conn_idcounter = real_conn_id; + cid += RXRPC_CID_INC; + if (cid < RXRPC_CID_INC) { + cid = RXRPC_CID_INC; + conn->trans->conn_idcounter = cid; goto attempt_insertion; } @@ -291,7 +290,7 @@ id_exists: xconn = rb_entry(parent, struct rxrpc_connection, node); if (epoch < xconn->epoch || - real_conn_id < xconn->real_conn_id) + cid < xconn->cid) goto attempt_insertion; } } @@ -334,7 +333,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, */ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, struct rxrpc_transport *trans, - __be16 service_id, + u16 service_id, struct rxrpc_call *call, gfp_t gfp) { @@ -404,11 +403,11 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | htonl(chan); - call->call_id = htonl(++conn->call_counter); + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, ntohl(call->call_id)); + conn->debug_id, chan, call->call_id); spin_unlock(&trans->client_lock); @@ -593,11 +592,11 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | htonl(chan); - call->call_id = htonl(++conn->call_counter); + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, ntohl(call->call_id)); + conn->debug_id, chan, call->call_id); ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); spin_unlock(&trans->client_lock); @@ -620,21 +619,21 @@ interrupted: */ struct rxrpc_connection * rxrpc_incoming_connection(struct rxrpc_transport *trans, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, gfp_t gfp) { struct rxrpc_connection *conn, *candidate = NULL; struct rb_node *p, **pp; const char *new = "old"; __be32 epoch; - u32 conn_id; + u32 cid; _enter(""); ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); epoch = hdr->epoch; - conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + cid = hdr->cid & RXRPC_CIDMASK; /* search the connection list first */ read_lock_bh(&trans->conn_lock); @@ -643,15 +642,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->real_conn_id); + _debug("maybe %x", conn->cid); if (epoch < conn->epoch) p = p->rb_left; else if (epoch > conn->epoch) p = p->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) p = p->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) p = p->rb_right; else goto found_extant_connection; @@ -668,12 +667,11 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, candidate->trans = trans; candidate->epoch = hdr->epoch; - candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK); + candidate->cid = hdr->cid & RXRPC_CIDMASK; candidate->service_id = hdr->serviceId; candidate->security_ix = hdr->securityIndex; candidate->in_clientflag = RXRPC_CLIENT_INITIATED; candidate->out_clientflag = 0; - candidate->real_conn_id = conn_id; candidate->state = RXRPC_CONN_SERVER; if (candidate->service_id) candidate->state = RXRPC_CONN_SERVER_UNSECURED; @@ -690,9 +688,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, pp = &(*pp)->rb_left; else if (epoch > conn->epoch) pp = &(*pp)->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) pp = &(*pp)->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) pp = &(*pp)->rb_right; else goto found_extant_second; @@ -714,7 +712,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, new = "new"; success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id); + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid); _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return conn; @@ -751,18 +749,17 @@ security_mismatch: * packet */ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, - struct rxrpc_header *hdr) + struct rxrpc_host_header *hdr) { struct rxrpc_connection *conn; struct rb_node *p; - __be32 epoch; - u32 conn_id; + u32 epoch, cid; - _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags); + _enter(",{%x,%x}", hdr->cid, hdr->flags); read_lock_bh(&trans->conn_lock); - conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + cid = hdr->cid & RXRPC_CIDMASK; epoch = hdr->epoch; if (hdr->flags & RXRPC_CLIENT_INITIATED) @@ -773,15 +770,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->real_conn_id); + _debug("maybe %x", conn->cid); if (epoch < conn->epoch) p = p->rb_left; else if (epoch > conn->epoch) p = p->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) p = p->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) p = p->rb_right; else goto found; diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c index e7ed43a54c41..1bdaaed8cdc4 100644 --- a/net/rxrpc/ar-connevent.c +++ b/net/rxrpc/ar-connevent.c @@ -42,9 +42,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, call->state = state; call->abort_code = abort_code; if (state == RXRPC_CALL_LOCALLY_ABORTED) - set_bit(RXRPC_CALL_CONN_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); else - set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); @@ -60,11 +60,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, static int rxrpc_abort_connection(struct rxrpc_connection *conn, u32 error, u32 abort_code) { - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; __be32 word; size_t len; + u32 serial; int ret; _enter("%d,,%u,%u", conn->debug_id, error, abort_code); @@ -89,28 +90,29 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = conn->epoch; - hdr.cid = conn->cid; - hdr.callNumber = 0; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_ABORT; - hdr.flags = conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = conn->service_id; + whdr.epoch = htonl(conn->epoch); + whdr.cid = htonl(conn->cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); word = htonl(abort_code); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &word; iov[1].iov_len = sizeof(word); len = iov[0].iov_len + iov[1].iov_len; - hdr.serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CONN ABORT %%%u { %d }", serial, abort_code); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -132,7 +134,7 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) if (call) { read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_SECURED, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) rxrpc_queue_call(call); read_unlock(&call->state_lock); } @@ -146,8 +148,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, u32 *_abort_code) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 tmp; - u32 serial; + __be32 wtmp; + u32 abort_code; int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { @@ -155,19 +157,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return -ECONNABORTED; } - serial = ntohl(sp->hdr.serial); - - _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial); + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0) + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; - _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp)); + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - ntohl(tmp)); + abort_code); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -335,7 +336,7 @@ void rxrpc_reject_packets(struct work_struct *work) struct sockaddr_in sin; } sa; struct rxrpc_skb_priv *sp; - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct rxrpc_local *local; struct sk_buff *skb; struct msghdr msg; @@ -348,11 +349,11 @@ void rxrpc_reject_packets(struct work_struct *work) _enter("%d", local->debug_id); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(hdr) + sizeof(code); + size = sizeof(whdr) + sizeof(code); msg.msg_name = &sa; msg.msg_control = NULL; @@ -370,8 +371,8 @@ void rxrpc_reject_packets(struct work_struct *work) break; } - memset(&hdr, 0, sizeof(hdr)); - hdr.type = RXRPC_PACKET_TYPE_ABORT; + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { sp = rxrpc_skb(skb); @@ -381,13 +382,13 @@ void rxrpc_reject_packets(struct work_struct *work) sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; code = htonl(skb->priority); - hdr.epoch = sp->hdr.epoch; - hdr.cid = sp->hdr.cid; - hdr.callNumber = sp->hdr.callNumber; - hdr.serviceId = sp->hdr.serviceId; - hdr.flags = sp->hdr.flags; - hdr.flags ^= RXRPC_CLIENT_INITIATED; - hdr.flags &= RXRPC_CLIENT_INITIATED; + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; kernel_sendmsg(local->socket, &msg, iov, 2, size); break; diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 0610efa83d72..3e82d6f0313c 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -115,7 +115,6 @@ void rxrpc_UDP_error_report(struct sock *sk) /* pass the transport ref to error_handler to release */ skb_queue_tail(&trans->error_queue, skb); rxrpc_queue_work(&trans->error_handler); - _leave(""); } @@ -152,28 +151,18 @@ void rxrpc_UDP_error_handler(struct work_struct *work) switch (ee->ee_code) { case ICMP_NET_UNREACH: _net("Rx Received ICMP Network Unreachable"); - err = ENETUNREACH; break; case ICMP_HOST_UNREACH: _net("Rx Received ICMP Host Unreachable"); - err = EHOSTUNREACH; break; case ICMP_PORT_UNREACH: _net("Rx Received ICMP Port Unreachable"); - err = ECONNREFUSED; - break; - case ICMP_FRAG_NEEDED: - _net("Rx Received ICMP Fragmentation Needed (%d)", - ee->ee_info); - err = 0; /* dealt with elsewhere */ break; case ICMP_NET_UNKNOWN: _net("Rx Received ICMP Unknown Network"); - err = ENETUNREACH; break; case ICMP_HOST_UNKNOWN: _net("Rx Received ICMP Unknown Host"); - err = EHOSTUNREACH; break; default: _net("Rx Received ICMP DestUnreach code=%u", @@ -222,7 +211,7 @@ void rxrpc_UDP_error_handler(struct work_struct *work) if (call->state != RXRPC_CALL_COMPLETE && call->state < RXRPC_CALL_NETWORK_ERROR) { call->state = RXRPC_CALL_NETWORK_ERROR; - set_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 4505a691d88c..63ed75c40e29 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -231,7 +231,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, _debug("drain rx oos now"); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) rxrpc_queue_call(call); read_unlock(&call->state_lock); } @@ -287,12 +287,12 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) call->acks_latest = serial; _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); write_unlock_bh(&call->state_lock); if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RESEND, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); } break; @@ -310,8 +310,8 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 _abort_code; - u32 serial, hi_serial, seq, abort_code; + __be32 wtmp; + u32 hi_serial, abort_code; _enter("%p,%p", call, skb); @@ -330,16 +330,15 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) /* track the latest serial number on this connection for ACK packet * information */ - serial = ntohl(sp->hdr.serial); hi_serial = atomic_read(&call->conn->hi_serial); - while (serial > hi_serial) + while (sp->hdr.serial > hi_serial) hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, - serial); + sp->hdr.serial); /* request ACK generation for any ACK or DATA packet that requests * it */ if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", serial); + _proto("ACK Requested on %%%u", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); } @@ -347,24 +346,23 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_PACKET_TYPE_ABORT: _debug("abort"); - if (skb_copy_bits(skb, 0, &_abort_code, - sizeof(_abort_code)) < 0) + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) goto protocol_error; - abort_code = ntohl(_abort_code); - _proto("Rx ABORT %%%u { %x }", serial, abort_code); + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); write_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_REMOTELY_ABORTED; call->abort_code = abort_code; - set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); rxrpc_queue_call(call); } goto free_packet_unlock; case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", serial); + _proto("Rx BUSY %%%u", sp->hdr.serial); if (call->conn->out_clientflag) goto protocol_error; @@ -373,7 +371,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) switch (call->state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: call->state = RXRPC_CALL_SERVER_BUSY; - set_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); rxrpc_queue_call(call); case RXRPC_CALL_SERVER_BUSY: goto free_packet_unlock; @@ -382,15 +380,13 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) } default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial); + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); goto protocol_error; case RXRPC_PACKET_TYPE_DATA: - seq = ntohl(sp->hdr.seq); + _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - _proto("Rx DATA %%%u { #%u }", serial, seq); - - if (seq == 0) + if (sp->hdr.seq == 0) goto protocol_error; call->ackr_prev_seq = sp->hdr.seq; @@ -398,9 +394,9 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) /* received data implicitly ACKs all of the request packets we * sent when we're acting as a client */ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, serial); + rxrpc_assume_implicit_ackall(call, sp->hdr.serial); - switch (rxrpc_fast_process_data(call, skb, seq)) { + switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { case 0: skb = NULL; goto done; @@ -433,7 +429,7 @@ protocol_error_locked: if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } free_packet_unlock: @@ -481,12 +477,12 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, if (!pskb_pull(jumbo, sizeof(jhdr))) BUG(); - sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1); - sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1); + sp->hdr.seq += 1; + sp->hdr.serial += 1; sp->hdr.flags = jhdr.flags; sp->hdr._rsvd = jhdr._rsvd; - _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1); + _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); rxrpc_fast_process_packet(call, part); part = NULL; @@ -505,7 +501,7 @@ protocol_error: if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock_bh(&call->state_lock); @@ -530,7 +526,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) { + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_queue_call(call); goto free_unlock; } @@ -546,7 +542,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, /* resend last packet of a completed call */ _debug("final ack again"); rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; default: @@ -607,6 +603,35 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, rxrpc_queue_work(&local->event_processor); } +/* + * Extract the wire header from a packet and translate the byte order. + */ +static noinline +int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0) + return -EBADMSG; + if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr))) + BUG(); + + memset(sp, 0, sizeof(*sp)); + sp->hdr.epoch = ntohl(whdr.epoch); + sp->hdr.cid = ntohl(whdr.cid); + sp->hdr.callNumber = ntohl(whdr.callNumber); + sp->hdr.seq = ntohl(whdr.seq); + sp->hdr.serial = ntohl(whdr.serial); + sp->hdr.flags = whdr.flags; + sp->hdr.type = whdr.type; + sp->hdr.userStatus = whdr.userStatus; + sp->hdr.securityIndex = whdr.securityIndex; + sp->hdr._rsvd = ntohs(whdr._rsvd); + sp->hdr.serviceId = ntohs(whdr.serviceId); + return 0; +} + static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb, struct rxrpc_skb_priv *sp) @@ -686,29 +711,25 @@ void rxrpc_data_ready(struct sock *sk) UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0); - /* the socket buffer we have is owned by UDP, with UDP's data all over - * it, but we really want our own */ + /* The socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own data there. + */ skb_orphan(skb); sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); _net("Rx UDP packet from %08x:%04hu", ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr, - sizeof(sp->hdr)) < 0) + if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; - if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr))) - BUG(); _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - ntohl(sp->hdr.epoch), - ntohl(sp->hdr.cid), - ntohl(sp->hdr.callNumber)); + sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) { + if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || + !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { _proto("Rx Bad Packet Type %u", sp->hdr.type); goto bad_message; } @@ -737,14 +758,9 @@ void rxrpc_data_ready(struct sock *sk) rxrpc_put_connection(conn); } else { struct rxrpc_call *call; - u8 in_clientflag = 0; - - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) - in_clientflag = RXRPC_CLIENT_INITIATED; - call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid, - sp->hdr.callNumber, sp->hdr.epoch, - sp->hdr.serviceId, local, AF_INET, - (u8 *)&ip_hdr(skb)->saddr); + + call = rxrpc_find_call_hash(&sp->hdr, local, + AF_INET, &ip_hdr(skb)->saddr); if (call) rxrpc_post_packet_to_call(call, skb); else @@ -759,7 +775,7 @@ cant_route_call: _debug("can't route call"); if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == cpu_to_be32(1)) { + if (sp->hdr.seq == 1) { _debug("first packet"); skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->acceptor); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2934a73a5981..a3002f4ddc90 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -16,7 +16,7 @@ BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \ (POISON_FREE << 8 | POISON_FREE)) #else -#define CHECK_SLAB_OKAY(X) do {} while(0) +#define CHECK_SLAB_OKAY(X) do {} while (0) #endif #define FCRYPT_BSIZE 8 @@ -70,12 +70,31 @@ struct rxrpc_sock { #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT struct sockaddr_rxrpc srx; /* local address */ sa_family_t proto; /* protocol created with */ - __be16 service_id; /* service ID of local/remote service */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) /* + * CPU-byteorder normalised Rx packet header. + */ +struct rxrpc_host_header { + u32 epoch; /* client boot timestamp */ + u32 cid; /* connection and channel ID */ + u32 callNumber; /* call ID (0 for connection-level packets) */ + u32 seq; /* sequence number of pkt in call stream */ + u32 serial; /* serial number of pkt sent to network */ + u8 type; /* packet type */ + u8 flags; /* packet flags */ + u8 userStatus; /* app-layer defined status */ + u8 securityIndex; /* security protocol ID */ + union { + u16 _rsvd; /* reserved */ + u16 cksum; /* kerberos security checksum */ + }; + u16 serviceId; /* service ID */ +} __packed; + +/* * RxRPC socket buffer private variables * - max 48 bytes (struct sk_buff::cb) */ @@ -89,7 +108,7 @@ struct rxrpc_skb_priv { bool need_resend; /* T if needs resending */ }; - struct rxrpc_header hdr; /* RxRPC packet header from this packet */ + struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) @@ -230,7 +249,7 @@ struct rxrpc_conn_bundle { atomic_t usage; int debug_id; /* debug ID for printks */ unsigned short num_conns; /* number of connections in this bundle */ - __be16 service_id; /* service ID */ + u16 service_id; /* Service ID for this bundle */ u8 security_ix; /* security type */ }; @@ -260,7 +279,6 @@ struct rxrpc_connection { rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; - u32 real_conn_id; /* connection ID (host-endian) */ enum { /* current state of connection */ RXRPC_CONN_UNUSED, /* - connection not yet attempted */ RXRPC_CONN_CLIENT, /* - client connection */ @@ -282,17 +300,76 @@ struct rxrpc_connection { u8 security_size; /* security header size */ u32 security_level; /* security level negotiated */ u32 security_nonce; /* response re-use preventer */ - - /* the following are all in net order */ - __be32 epoch; /* epoch of this connection */ - __be32 cid; /* connection ID */ - __be16 service_id; /* service ID */ + u32 epoch; /* epoch of this connection */ + u32 cid; /* connection ID */ + u16 service_id; /* service ID for this connection */ u8 security_ix; /* security type */ u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; /* + * Flags in call->flags. + */ +enum rxrpc_call_flag { + RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ + RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ + RXRPC_CALL_RCVD_LAST, /* all packets received */ + RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ + RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ + RXRPC_CALL_PROC_BUSY, /* the processor is busy */ + RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ + RXRPC_CALL_HAS_USERID, /* has a user ID attached */ + RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ +}; + +/* + * Events that can be raised on a call. + */ +enum rxrpc_call_event { + RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ + RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ + RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ + RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ + RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ + RXRPC_CALL_EV_ACK, /* need to generate ACK */ + RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ + RXRPC_CALL_EV_ABORT, /* need to generate abort */ + RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ + RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_RESEND, /* Tx resend required */ + RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ + RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ + RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ + RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ + RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */ + RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */ +}; + +/* + * The states that a call can be in. + */ +enum rxrpc_call_state { + RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ + RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ + RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ + RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ + RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ + RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ + RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ + RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ + RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ + RXRPC_CALL_COMPLETE, /* - call completed */ + RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + RXRPC_CALL_DEAD, /* - call is dead */ + NR__RXRPC_CALL_STATES +}; + +/* * RxRPC call definition * - matched by { connection, call_id } */ @@ -317,57 +394,13 @@ struct rxrpc_call { unsigned long user_call_ID; /* user-defined call ID */ unsigned long creation_jif; /* time of call creation */ unsigned long flags; -#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */ -#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */ -#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */ -#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */ -#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */ -#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */ -#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */ -#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */ -#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */ unsigned long events; -#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */ -#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */ -#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */ -#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */ -#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */ -#define RXRPC_CALL_ACK 5 /* need to generate ACK */ -#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */ -#define RXRPC_CALL_ABORT 7 /* need to generate abort */ -#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */ -#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */ -#define RXRPC_CALL_RESEND 10 /* Tx resend required */ -#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */ -#define RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */ -#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */ -#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */ -#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" message to the app */ -#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */ - spinlock_t lock; rwlock_t state_lock; /* lock for state transition */ atomic_t usage; atomic_t sequence; /* Tx data packet sequence counter */ u32 abort_code; /* local/remote abort code */ - enum { /* current state of call */ - RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ - RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ - RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ - RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ - RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ - RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ - RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ - RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ - RXRPC_CALL_COMPLETE, /* - call completed */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - RXRPC_CALL_DEAD, /* - call is dead */ - } state; + enum rxrpc_call_state state : 8; /* current state of call */ int debug_id; /* debug ID for printks */ u8 channel; /* connection channel occupied by this call */ @@ -389,9 +422,9 @@ struct rxrpc_call { rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */ + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ - __be32 ackr_serial; /* serial of packet being ACK'd */ + rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ atomic_t ackr_not_idle; /* number of packets in Rx queue */ /* received packet records, 1 bit per record */ @@ -403,11 +436,10 @@ struct rxrpc_call { u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ struct rxrpc_local *local; /* Local endpoint. Used for hashing. */ sa_family_t proto; /* Frame protocol */ - /* the following should all be in net order */ - __be32 cid; /* connection ID + channel index */ - __be32 call_id; /* call ID on connection */ - __be32 epoch; /* epoch of this connection */ - __be16 service_id; /* service ID */ + u32 call_id; /* call ID on connection */ + u32 cid; /* connection ID plus channel index */ + u32 epoch; /* epoch of this connection */ + u16 service_id; /* service ID */ union { /* Peer IP address for hashing */ __be32 ipv4_addr; __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ @@ -423,7 +455,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) if (call->state < RXRPC_CALL_COMPLETE) { call->abort_code = abort_code; call->state = RXRPC_CALL_LOCALLY_ABORTED; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } write_unlock_bh(&call->state_lock); } @@ -432,7 +464,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) * af_rxrpc.c */ extern atomic_t rxrpc_n_skbs; -extern __be32 rxrpc_epoch; +extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -446,35 +478,35 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * ar-ack.c */ -extern unsigned rxrpc_requested_ack_delay; -extern unsigned rxrpc_soft_ack_delay; -extern unsigned rxrpc_idle_ack_delay; -extern unsigned rxrpc_rx_window_size; -extern unsigned rxrpc_rx_mtu; -extern unsigned rxrpc_rx_jumbo_max; +extern unsigned int rxrpc_requested_ack_delay; +extern unsigned int rxrpc_soft_ack_delay; +extern unsigned int rxrpc_idle_ack_delay; +extern unsigned int rxrpc_rx_window_size; +extern unsigned int rxrpc_rx_mtu; +extern unsigned int rxrpc_rx_jumbo_max; -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); +void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_process_call(struct work_struct *); /* * ar-call.c */ -extern unsigned rxrpc_max_call_lifetime; -extern unsigned rxrpc_dead_call_expiry; +extern unsigned int rxrpc_max_call_lifetime; +extern unsigned int rxrpc_dead_call_expiry; extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; -struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32, - __be16, void *, sa_family_t, const u8 *); +struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, + void *, sa_family_t, const void *); struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, unsigned long, int, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_header *, gfp_t); + struct rxrpc_host_header *, gfp_t); struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); @@ -484,22 +516,22 @@ void __exit rxrpc_destroy_all_calls(void); /* * ar-connection.c */ -extern unsigned rxrpc_connection_expiry; +extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, struct rxrpc_transport *, - struct key *, __be16, gfp_t); + struct key *, u16, gfp_t); void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, - struct rxrpc_header *); + struct rxrpc_host_header *); extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *, +rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *, gfp_t); /* @@ -547,7 +579,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, /* * ar-output.c */ -extern unsigned rxrpc_resend_timeout; +extern unsigned int rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *, @@ -595,7 +627,7 @@ void rxrpc_packet_destructor(struct sk_buff *); /* * ar-transport.c */ -extern unsigned rxrpc_transport_expiry; +extern unsigned int rxrpc_transport_expiry; struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, struct rxrpc_peer *, gfp_t); @@ -694,7 +726,7 @@ do { \ printk(KERN_ERR "RxRPC: Assertion failed\n"); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTCMP(X, OP, Y) \ do { \ @@ -707,7 +739,7 @@ do { \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTIF(C, X) \ do { \ @@ -716,7 +748,7 @@ do { \ printk(KERN_ERR "RxRPC: Assertion failed\n"); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ @@ -729,25 +761,25 @@ do { \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ -} while(0) +} while (0) #else #define ASSERT(X) \ do { \ -} while(0) +} while (0) #define ASSERTCMP(X, OP, Y) \ do { \ -} while(0) +} while (0) #define ASSERTIF(C, X) \ do { \ -} while(0) +} while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ -} while(0) +} while (0) #endif /* __KDEBUGALL */ @@ -804,9 +836,9 @@ do { \ CHECK_SLAB_OKAY(&(CALL)->usage); \ if (atomic_inc_return(&(CALL)->usage) == 1) \ BUG(); \ -} while(0) +} while (0) #define rxrpc_put_call(CALL) \ do { \ __rxrpc_put_call(CALL); \ -} while(0) +} while (0) diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 78483b4602bf..4e1e6db0050b 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -323,9 +323,11 @@ void __exit rxrpc_destroy_all_locals(void) * Reply to a version request */ static void rxrpc_send_version_request(struct rxrpc_local *local, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, struct sk_buff *skb) { + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sockaddr_in sin; struct msghdr msg; struct kvec iov[2]; @@ -344,15 +346,20 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->seq = 0; - hdr->serial = 0; - hdr->type = RXRPC_PACKET_TYPE_VERSION; - hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); - hdr->userStatus = 0; - hdr->_rsvd = 0; - - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; + whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = (char *)rxrpc_version_string; iov[1].iov_len = sizeof(rxrpc_version_string); @@ -383,7 +390,7 @@ static void rxrpc_process_local_events(struct work_struct *work) while ((skb = skb_dequeue(&local->event_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - kdebug("{%d},{%u}", local->debug_id, sp->hdr.type); + _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 14c4e12c47b0..d36fb6e1a29c 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -21,7 +21,7 @@ /* * Time till packet resend (in jiffies). */ -unsigned rxrpc_resend_timeout = 4 * HZ; +unsigned int rxrpc_resend_timeout = 4 * HZ; static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -111,11 +111,11 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = abort_code; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); rxrpc_queue_call(call); } @@ -136,7 +136,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, struct rxrpc_call *call; unsigned long user_call_ID = 0; struct key *key; - __be16 service_id; + u16 service_id; u32 abort_code = 0; int ret; @@ -151,11 +151,11 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, bundle = NULL; if (trans) { - service_id = rx->service_id; + service_id = rx->srx.srx_service; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); - service_id = htons(srx->srx_service); + service_id = srx->srx_service; } key = rx->key; if (key && !rx->key->payload.data[0]) @@ -348,7 +348,7 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) { + if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) { down_read(&trans->local->defrag_sem); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet @@ -401,7 +401,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, int ret; _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz), + CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz), *timeo); add_wait_queue(&call->tx_waitq, &myself); @@ -409,7 +410,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (CIRC_SPACE(call->acks_head, call->acks_tail, + if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), call->acks_winsz) > 0) break; if (signal_pending(current)) { @@ -437,7 +438,7 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call) if (try_to_del_timer_sync(&call->resend_timer) >= 0) { clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); @@ -480,8 +481,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, write_unlock_bh(&call->state_lock); } - _proto("Tx DATA %%%u { #%u }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); sp->need_resend = false; sp->resend_at = jiffies + rxrpc_resend_timeout; @@ -513,6 +513,29 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, } /* + * Convert a host-endian header into a network-endian header. + */ +static void rxrpc_insert_header(struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(sp->hdr.serial); + whdr.type = sp->hdr.type; + whdr.flags = sp->hdr.flags; + whdr.userStatus = sp->hdr.userStatus; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(sp->hdr.serviceId); + + memcpy(skb->head, &whdr, sizeof(whdr)); +} + +/* * send data through a socket * - must be called in process context * - caller holds the socket locked @@ -548,7 +571,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (CIRC_SPACE(call->acks_head, call->acks_tail, + if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), call->acks_winsz) <= 0) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) @@ -650,22 +674,22 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, seq = atomic_inc_return(&call->sequence); - sp->hdr.epoch = conn->epoch; - sp->hdr.cid = call->cid; + sp->hdr.epoch = conn->epoch; + sp->hdr.cid = call->cid; sp->hdr.callNumber = call->call_id; - sp->hdr.seq = htonl(seq); - sp->hdr.serial = - htonl(atomic_inc_return(&conn->serial)); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.seq = seq; + sp->hdr.serial = atomic_inc_return(&conn->serial); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; sp->hdr.userStatus = 0; sp->hdr.securityIndex = conn->security_ix; - sp->hdr._rsvd = 0; - sp->hdr.serviceId = conn->service_id; + sp->hdr._rsvd = 0; + sp->hdr.serviceId = call->service_id; sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, call->acks_tail, + else if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), call->acks_winsz) > 1) sp->hdr.flags |= RXRPC_MORE_PACKETS; if (more && seq & 1) @@ -673,12 +697,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ret = rxrpc_secure_packet( call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_header)); + skb->head + sizeof(struct rxrpc_wire_header)); if (ret < 0) goto out; - memcpy(skb->head, &sp->hdr, - sizeof(struct rxrpc_header)); + rxrpc_insert_header(skb); rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c index bebaa43484bc..dc089b1976aa 100644 --- a/net/rxrpc/ar-peer.c +++ b/net/rxrpc/ar-peer.c @@ -92,7 +92,7 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, BUG(); } - peer->hdrsize += sizeof(struct rxrpc_header); + peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->maxdata = peer->mtu - peer->hdrsize; } diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c index 38047f713f2c..525b2ba5a8f4 100644 --- a/net/rxrpc/ar-proc.c +++ b/net/rxrpc/ar-proc.c @@ -74,9 +74,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) " %-8.8s %08x %lx\n", lbuff, rbuff, - ntohs(call->conn->service_id), - ntohl(call->conn->cid), - ntohl(call->call_id), + call->conn->service_id, + call->cid, + call->call_id, call->conn->in_clientflag ? "Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], @@ -157,8 +157,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) " %s %08x %08x %08x\n", lbuff, rbuff, - ntohs(conn->service_id), - ntohl(conn->cid), + conn->service_id, + conn->cid, conn->call_counter, conn->in_clientflag ? "Svc" : "Clt", atomic_read(&conn->usage), diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index b92beded7459..64facba24a45 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -33,7 +33,7 @@ void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -158,7 +158,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto receive_non_data_message; _debug("recvmsg DATA #%u { %d, %d }", - ntohl(sp->hdr.seq), skb->len, sp->offset); + sp->hdr.seq, skb->len, sp->offset); if (!continue_call) { /* only set the control data once per recvmsg() */ @@ -169,11 +169,11 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); } - ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); - ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); - call->rx_data_recv = ntohl(sp->hdr.seq); + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; - ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); offset = sp->offset; copy = skb->len - offset; @@ -364,11 +364,11 @@ void rxrpc_kernel_data_delivered(struct sk_buff *skb) struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call = sp->call; - ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); - ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); - call->rx_data_recv = ntohl(sp->hdr.seq); + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; - ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); rxrpc_free_skb(skb); } diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c index 8334474eb26c..ceff6394a65f 100644 --- a/net/rxrpc/ar-security.c +++ b/net/rxrpc/ar-security.c @@ -167,11 +167,11 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) struct rxrpc_sock *rx; struct key *key; key_ref_t kref; - char kdesc[5+1+3+1]; + char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix); + sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); sec = rxrpc_security_lookup(conn->security_ix); if (!sec) { @@ -182,7 +182,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* find the service */ read_lock_bh(&local->services_lock); list_for_each_entry(rx, &local->services, listen_link) { - if (rx->service_id == conn->service_id) + if (rx->srx.srx_service == conn->service_id) goto found_service; } diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c index 4cfab49e329d..62a267472fce 100644 --- a/net/rxrpc/ar-skbuff.c +++ b/net/rxrpc/ar-skbuff.c @@ -34,7 +34,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* get an extra ref on the call for the final-ACK generator to * release */ rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); break; @@ -59,7 +59,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, spin_lock_bh(&call->lock); - _debug("hard ACK #%u", ntohl(sp->hdr.seq)); + _debug("hard ACK #%u", sp->hdr.seq); for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { call->ackr_window[loop] >>= 1; @@ -67,7 +67,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); } - seq = ntohl(sp->hdr.seq); + seq = sp->hdr.seq; ASSERTCMP(seq, ==, call->rx_data_eaten + 1); call->rx_data_eaten = seq; @@ -133,5 +133,4 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb) { rxrpc_free_skb(skb); } - EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 9946467f16b4..66a1a5676446 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -20,7 +20,7 @@ /* * Time after last use at which transport record is cleaned up. */ -unsigned rxrpc_transport_expiry = 3600 * 24; +unsigned int rxrpc_transport_expiry = 3600 * 24; static void rxrpc_transport_reaper(struct work_struct *work); @@ -51,6 +51,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, spin_lock_init(&trans->client_lock); rwlock_init(&trans->conn_lock); atomic_set(&trans->usage, 1); + trans->conn_idcounter = peer->srx.srx_service << 16; trans->debug_id = atomic_inc_return(&rxrpc_debug_id); if (peer->srx.transport.family == AF_INET) { diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index d7a9ab5a9d9c..3106a0c4960b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -132,8 +132,8 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn) desc.info = iv.x; desc.flags = 0; - tmpbuf.x[0] = conn->epoch; - tmpbuf.x[1] = conn->cid; + tmpbuf.x[0] = htonl(conn->epoch); + tmpbuf.x[1] = htonl(conn->cid); tmpbuf.x[2] = 0; tmpbuf.x[3] = htonl(conn->security_ix); @@ -142,7 +142,7 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn) crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); - ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]); + ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]); _leave(""); } @@ -169,8 +169,8 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); - data_size |= (u32) check << 16; + check = sp->hdr.seq ^ sp->hdr.callNumber; + data_size |= (u32)check << 16; tmpbuf.hdr.data_size = htonl(data_size); memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); @@ -195,9 +195,9 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, * wholly encrypt a packet (level 2 security) */ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 data_size, - void *sechdr) + struct sk_buff *skb, + u32 data_size, + void *sechdr) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr rxkhdr @@ -215,9 +215,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, _enter(""); - check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check = sp->hdr.seq ^ sp->hdr.callNumber; - rxkhdr.data_size = htonl(data_size | (u32) check << 16); + rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; /* encrypt from the session key */ @@ -251,9 +251,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, * checksum an RxRPC packet header */ static int rxkad_secure_packet(const struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { struct rxrpc_skb_priv *sp; struct blkcipher_desc desc; @@ -262,14 +262,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, struct { __be32 x[2]; } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ - __be32 x; - u32 y; + u32 x, y; int ret; sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u},%zu,", - call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq), + call->debug_id, key_serial(call->conn->key), sp->hdr.seq, data_size); if (!call->conn->cipher) @@ -286,10 +285,10 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, desc.flags = 0; /* calculate the security checksum */ - x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); - tmpbuf.x[0] = sp->hdr.callNumber; - tmpbuf.x[1] = x; + x = call->channel << (32 - RXRPC_CIDSHIFT); + x |= sp->hdr.seq & 0x3fffffff; + tmpbuf.x[0] = htonl(sp->hdr.callNumber); + tmpbuf.x[1] = htonl(x); sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); @@ -299,7 +298,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ - sp->hdr.cksum = htons(y); + sp->hdr.cksum = y; switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: @@ -368,7 +367,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check ^= sp->hdr.seq ^ sp->hdr.callNumber; check &= 0xffff; if (check != 0) { *_abort_code = RXKADSEALEDINCON; @@ -453,7 +452,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check ^= sp->hdr.seq ^ sp->hdr.callNumber; check &= 0xffff; if (check != 0) { *_abort_code = RXKADSEALEDINCON; @@ -494,16 +493,14 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, struct { __be32 x[2]; } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ - __be32 x; - __be16 cksum; - u32 y; + u16 cksum; + u32 x, y; int ret; sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->key), - ntohl(sp->hdr.seq)); + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); if (!call->conn->cipher) return 0; @@ -521,21 +518,20 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, desc.flags = 0; /* validate the security checksum */ - x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); - tmpbuf.x[0] = call->call_id; - tmpbuf.x[1] = x; + x = call->channel << (32 - RXRPC_CIDSHIFT); + x |= sp->hdr.seq & 0x3fffffff; + tmpbuf.x[0] = htonl(call->call_id); + tmpbuf.x[1] = htonl(x); sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); y = ntohl(tmpbuf.x[1]); - y = (y >> 16) & 0xffff; - if (y == 0) - y = 1; /* zero checksums are not permitted */ + cksum = (y >> 16) & 0xffff; + if (cksum == 0) + cksum = 1; /* zero checksums are not permitted */ - cksum = htons(y); if (sp->hdr.cksum != cksum) { *_abort_code = RXKADSEALEDINCON; _leave(" = -EPROTO [csum failed]"); @@ -567,10 +563,11 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, static int rxkad_issue_challenge(struct rxrpc_connection *conn) { struct rxkad_challenge challenge; - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; size_t len; + u32 serial; int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); @@ -592,26 +589,27 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = conn->epoch; - hdr.cid = conn->cid; - hdr.callNumber = 0; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_CHALLENGE; - hdr.flags = conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = conn->service_id; - - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + whdr.epoch = htonl(conn->epoch); + whdr.cid = htonl(conn->cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &challenge; iov[1].iov_len = sizeof(challenge); len = iov[0].iov_len + iov[1].iov_len; - hdr.serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial)); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CHALLENGE %%%u", serial); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -627,13 +625,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) * send a Kerberos security response */ static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, struct rxkad_response *resp, const struct rxkad_key *s2) { + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[3]; size_t len; + u32 serial; int ret; _enter(""); @@ -644,24 +644,26 @@ static int rxkad_send_response(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->epoch = conn->epoch; - hdr->seq = 0; - hdr->type = RXRPC_PACKET_TYPE_RESPONSE; - hdr->flags = conn->out_clientflag; - hdr->userStatus = 0; - hdr->_rsvd = 0; + memset(&whdr, 0, sizeof(whdr)); + whdr.epoch = htonl(hdr->epoch); + whdr.cid = htonl(hdr->cid); + whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + whdr.flags = conn->out_clientflag; + whdr.securityIndex = hdr->securityIndex; + whdr.serviceId = htons(hdr->serviceId); - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = resp; iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *) s2->ticket; + iov[2].iov_base = (void *)s2->ticket; iov[2].iov_len = s2->ticket_len; len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - hdr->serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx RESPONSE %%%u", ntohl(hdr->serial)); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx RESPONSE %%%u", serial); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); if (ret < 0) { @@ -770,7 +772,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, min_level = ntohl(challenge.min_level); _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", - ntohl(sp->hdr.serial), version, nonce, min_level); + sp->hdr.serial, version, nonce, min_level); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) @@ -785,22 +787,23 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, /* build the response packet */ memset(&resp, 0, sizeof(resp)); - resp.version = RXKAD_VERSION; - resp.encrypted.epoch = conn->epoch; - resp.encrypted.cid = conn->cid; - resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.version = htonl(RXKAD_VERSION); + resp.encrypted.epoch = htonl(conn->epoch); + resp.encrypted.cid = htonl(conn->cid); + resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.encrypted.inc_nonce = htonl(nonce + 1); + resp.encrypted.level = htonl(conn->security_level); + resp.kvno = htonl(token->kad->kvno); + resp.ticket_len = htonl(token->kad->ticket_len); + resp.encrypted.call_id[0] = - (conn->channels[0] ? conn->channels[0]->call_id : 0); + htonl(conn->channels[0] ? conn->channels[0]->call_id : 0); resp.encrypted.call_id[1] = - (conn->channels[1] ? conn->channels[1]->call_id : 0); + htonl(conn->channels[1] ? conn->channels[1]->call_id : 0); resp.encrypted.call_id[2] = - (conn->channels[2] ? conn->channels[2]->call_id : 0); + htonl(conn->channels[2] ? conn->channels[2]->call_id : 0); resp.encrypted.call_id[3] = - (conn->channels[3] ? conn->channels[3]->call_id : 0); - resp.encrypted.inc_nonce = htonl(nonce + 1); - resp.encrypted.level = htonl(conn->security_level); - resp.kvno = htonl(token->kad->kvno); - resp.ticket_len = htonl(token->kad->ticket_len); + htonl(conn->channels[3] ? conn->channels[3]->call_id : 0); /* calculate the response checksum and then do the encryption */ rxkad_calc_response_checksum(&resp); @@ -1022,7 +1025,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, kvno = ntohl(response.kvno); sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", - ntohl(sp->hdr.serial), version, kvno, ticket_len); + sp->hdr.serial, version, kvno, ticket_len); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) @@ -1058,9 +1061,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, rxkad_decrypt_response(conn, &response, &session_key); abort_code = RXKADSEALEDINCON; - if (response.encrypted.epoch != conn->epoch) + if (ntohl(response.encrypted.epoch) != conn->epoch) goto protocol_error_free; - if (response.encrypted.cid != conn->cid) + if (ntohl(response.encrypted.cid) != conn->cid) goto protocol_error_free; if (ntohl(response.encrypted.securityIndex) != conn->security_ix) goto protocol_error_free; @@ -1077,7 +1080,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, goto protocol_error_free; abort_code = RXKADOUTOFSEQUENCE; - if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1)) + if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) goto protocol_error_free; abort_code = RXKADLEVELFAIL; diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 50a98a910eb1..d20ed575acf4 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -15,11 +15,11 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; -static const unsigned zero = 0; -static const unsigned one = 1; -static const unsigned four = 4; -static const unsigned n_65535 = 65535; -static const unsigned n_max_acks = RXRPC_MAXACKS; +static const unsigned int zero = 0; +static const unsigned int one = 1; +static const unsigned int four = 4; +static const unsigned int n_65535 = 65535; +static const unsigned int n_max_acks = RXRPC_MAXACKS; /* * RxRPC operating parameters. @@ -32,7 +32,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "req_ack_delay", .data = &rxrpc_requested_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&zero, @@ -40,7 +40,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -48,7 +48,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -56,7 +56,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "resend_timeout", .data = &rxrpc_resend_timeout, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -66,7 +66,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "max_call_lifetime", .data = &rxrpc_max_call_lifetime, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, .extra1 = (void *)&one, @@ -74,7 +74,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "dead_call_expiry", .data = &rxrpc_dead_call_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, .extra1 = (void *)&one, @@ -84,7 +84,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "connection_expiry", .data = &rxrpc_connection_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -92,7 +92,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "transport_expiry", .data = &rxrpc_transport_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -102,7 +102,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "rx_window_size", .data = &rxrpc_rx_window_size, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -111,16 +111,16 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "rx_mtu", .data = &rxrpc_rx_mtu, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, - .extra1 = (void *)&n_65535, + .extra2 = (void *)&n_65535, }, { .procname = "rx_jumbo_max", .data = &rxrpc_rx_jumbo_max, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 82830824fb1f..b148302bbaf2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -739,6 +739,28 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_IFE + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow for sourcing and terminating metadata + For details refer to netdev01 paper: + "Distributing Linux Traffic Control Classifier-Action Subsystem" + Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + + To compile this code as a module, choose M here: the + module will be called act_ife. + +config NET_IFE_SKBMARK + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE + ---help--- + +config NET_IFE_SKBPRIO + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 690c1689e090..84bddb373517 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,6 +19,9 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_IFE) += act_ife.o +obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o +obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 06e7c4a37245..96066665e376 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head) kfree(p); } -static void tcf_hash_destroy(struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfc_head); @@ -68,8 +67,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { if (a->ops->cleanup) a->ops->cleanup(a, bind); - tcf_hash_destroy(a); - ret = 1; + tcf_hash_destroy(a->hinfo, a); + ret = ACT_P_DELETED; } } @@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_hash_release); -static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, - struct tc_action *a) +static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct netlink_callback *cb, struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -126,9 +124,9 @@ nla_put_failure: goto done; } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) +static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct hlist_node *n; struct tcf_common *p; @@ -163,18 +161,24 @@ nla_put_failure: return ret; } -static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { + struct tcf_hashinfo *hinfo = tn->hinfo; + + a->hinfo = hinfo; + if (type == RTM_DELACTION) { - return tcf_del_walker(skb, a); + return tcf_del_walker(hinfo, skb, a); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb, a); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; } } +EXPORT_SYMBOL(tcf_generic_walker); static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { @@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) return p; } -u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tc_action_net *tn) { + struct tcf_hashinfo *hinfo = tn->hinfo; u32 val = hinfo->index; do { @@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = tcf_hash_lookup(index, hinfo); if (p) { a->priv = p; + a->hinfo = hinfo; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(u32 index, struct tc_action *a, int bind) +int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; a->priv = p; + a->hinfo = hinfo; return 1; } return 0; @@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) } EXPORT_SYMBOL(tcf_hash_cleanup); -int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind, bool cpustats) +int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, bool cpustats) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) @@ -272,7 +280,7 @@ err2: } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); + p->tcfc_index = index ? index : tcf_hash_new_index(tn); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { @@ -286,14 +294,15 @@ err2: } a->priv = (void *) p; + a->hinfo = hinfo; return 0; } EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tc_action *a) +void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); @@ -302,59 +311,78 @@ void tcf_hash_insert(struct tc_action *a) } EXPORT_SYMBOL(tcf_hash_insert); +void tcf_hashinfo_destroy(const struct tc_action_ops *ops, + struct tcf_hashinfo *hinfo) +{ + struct tc_action a = { + .ops = ops, + .hinfo = hinfo, + }; + int i; + + for (i = 0; i < hinfo->hmask + 1; i++) { + struct tcf_common *p; + struct hlist_node *n; + + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + int ret; + + a.priv = p; + ret = __tcf_hash_release(&a, false, true); + if (ret == ACT_P_DELETED) + module_put(ops->owner); + else if (ret < 0) + return; + } + } + kfree(hinfo->htab); +} +EXPORT_SYMBOL(tcf_hashinfo_destroy); + static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act, unsigned int mask) +int tcf_register_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; - int err; + int ret; - /* Must supply act, dump and init */ - if (!act->act || !act->dump || !act->init) + if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) return -EINVAL; - /* Supply defaults */ - if (!act->lookup) - act->lookup = tcf_hash_search; - if (!act->walk) - act->walk = tcf_generic_walker; - - act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); - if (!act->hinfo) - return -ENOMEM; - err = tcf_hashinfo_init(act->hinfo, mask); - if (err) { - kfree(act->hinfo); - return err; - } - write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); return -EEXIST; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); + + ret = register_pernet_subsys(ops); + if (ret) { + tcf_unregister_action(act, ops); + return ret; + } + return 0; } EXPORT_SYMBOL(tcf_register_action); -int tcf_unregister_action(struct tc_action_ops *act) +int tcf_unregister_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; + unregister_pernet_subsys(ops); + write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); err = 0; break; } @@ -721,8 +749,8 @@ static struct tc_action *create_a(int i) return act; } -static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) +static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; @@ -749,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; err = -ENOENT; - if (a->ops->lookup(a, index) == 0) + if (a->ops->lookup(net, a, index) == 0) goto err_mod; module_put(a->ops->owner); @@ -819,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); + err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); if (err < 0) goto out_module_put; if (err == 0) @@ -897,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, portid); + act = tcf_action_get_1(net, tb[i], n, portid); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; @@ -1044,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n) static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -1078,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0bc6f912f870..8c9f1f0459ab 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -33,6 +33,8 @@ struct tcf_bpf_cfg { bool is_ebpf; }; +static int bpf_net_id; + static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { @@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *act, int replace, int bind) { + struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; @@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_hash_check(parm->index, act, bind)) { - ret = tcf_hash_create(parm->index, est, act, + if (!tcf_hash_check(tn, parm->index, act, bind)) { + ret = tcf_hash_create(tn, parm->index, est, act, sizeof(*prog), bind, true); if (ret < 0) return ret; @@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(act); + tcf_hash_insert(tn, act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) tcf_bpf_cfg_cleanup(&tmp); } +static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, + .walk = tcf_bpf_walker, + .lookup = tcf_bpf_search, +}; + +static __net_init int bpf_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK); +} + +static void __net_exit bpf_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations bpf_net_ops = { + .init = bpf_init_net, + .exit = bpf_exit_net, + .id = &bpf_net_id, + .size = sizeof(struct tc_action_net), }; static int __init bpf_init_module(void) { - return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK); + return tcf_register_action(&act_bpf_ops, &bpf_net_ops); } static void __exit bpf_cleanup_module(void) { - tcf_unregister_action(&act_bpf_ops); + tcf_unregister_action(&act_bpf_ops, &bpf_net_ops); } module_init(bpf_init_module); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index bb41699c6c49..c0ed93ce2391 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -30,6 +30,8 @@ #define CONNMARK_TAB_MASK 3 +static int connmark_net_id; + static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*ci), bind, false); if (ret) return ret; @@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(a); + tcf_hash_insert(tn, a); ret = ACT_P_CREATED; } else { ci = to_connmark(a); @@ -169,6 +172,22 @@ nla_put_failure: return -1; } +static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark, .dump = tcf_connmark_dump, .init = tcf_connmark_init, + .walk = tcf_connmark_walker, + .lookup = tcf_connmark_search, +}; + +static __net_init int connmark_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK); +} + +static void __net_exit connmark_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations connmark_net_ops = { + .init = connmark_init_net, + .exit = connmark_exit_net, + .id = &connmark_net_id, + .size = sizeof(struct tc_action_net), }; static int __init connmark_init_module(void) { - return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK); + return tcf_register_action(&act_connmark_ops, &connmark_net_ops); } static void __exit connmark_cleanup_module(void) { - tcf_unregister_action(&act_connmark_ops); + tcf_unregister_action(&act_connmark_ops, &connmark_net_ops); } module_init(connmark_init_module); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b07c535ba8e7..d22426cdebc0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int csum_net_id; + +static int tcf_csum_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { + struct tc_action_net *tn = net_generic(net, csum_net_id); struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -105,9 +109,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, int hl = ihl + jhl; if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || - (skb_cloned(skb) && - !skb_clone_writable(skb, hl + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, hl + ntkoff)) return NULL; else return (void *)(skb_network_header(skb) + ihl); @@ -365,9 +367,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff)) goto fail; ip_send_check(ip_hdr(skb)); @@ -559,6 +559,22 @@ nla_put_failure: return -1; } +static int tcf_csum_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -566,6 +582,29 @@ static struct tc_action_ops act_csum_ops = { .act = tcf_csum, .dump = tcf_csum_dump, .init = tcf_csum_init, + .walk = tcf_csum_walker, + .lookup = tcf_csum_search, +}; + +static __net_init int csum_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK); +} + +static void __net_exit csum_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations csum_net_ops = { + .init = csum_init_net, + .exit = csum_exit_net, + .id = &csum_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Checksum updating actions"); @@ -573,12 +612,12 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { - return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); + return tcf_register_action(&act_csum_ops, &csum_net_ops); } static void __exit csum_cleanup_module(void) { - tcf_unregister_action(&act_csum_ops); + tcf_unregister_action(&act_csum_ops, &csum_net_ops); } module_init(csum_init_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 5c1b05170736..887fc1f209ff 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,6 +25,8 @@ #define GACT_TAB_MASK 15 +static int gact_net_id; + #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { @@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; @@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, true); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*gact), bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -183,6 +186,22 @@ nla_put_failure: return -1; } +static int tcf_gact_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = { .act = tcf_gact, .dump = tcf_gact_dump, .init = tcf_gact_init, + .walk = tcf_gact_walker, + .lookup = tcf_gact_search, +}; + +static __net_init int gact_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK); +} + +static void __net_exit gact_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations gact_net_ops = { + .init = gact_init_net, + .exit = gact_exit_net, + .id = &gact_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -203,12 +245,13 @@ static int __init gact_init_module(void) #else pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); + + return tcf_register_action(&act_gact_ops, &gact_net_ops); } static void __exit gact_cleanup_module(void) { - tcf_unregister_action(&act_gact_ops); + tcf_unregister_action(&act_gact_ops, &gact_net_ops); } module_init(gact_init_module); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c new file mode 100644 index 000000000000..c589a9ba506a --- /dev/null +++ b/net/sched/act_ife.c @@ -0,0 +1,870 @@ +/* + * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB + * + * Refer to: + * draft-ietf-forces-interfelfb-03 + * and + * netdev01 paper: + * "Distributing Linux Traffic Control Classifier-Action + * Subsystem" + * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> +#include <linux/etherdevice.h> + +#define IFE_TAB_MASK 15 + +static int ife_net_id; +static int max_metacnt = IFE_META_MAX + 1; + +static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { + [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, + [TCA_IFE_DMAC] = { .len = ETH_ALEN}, + [TCA_IFE_SMAC] = { .len = ETH_ALEN}, + [TCA_IFE_TYPE] = { .type = NLA_U16}, +}; + +/* Caller takes care of presenting data in network order +*/ +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) +{ + u32 *tlv = (u32 *)(skbdata); + u16 totlen = nla_total_size(dlen); /*alignment + hdr */ + char *dptr = (char *)tlv + NLA_HDRLEN; + u32 htlv = attrtype << 16 | totlen; + + *tlv = htonl(htlv); + memset(dptr, 0, totlen - NLA_HDRLEN); + memcpy(dptr, dval, dlen); + + return totlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); + +int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) +{ + if (mi->metaval) + return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval); + else + return nla_put(skb, mi->metaid, 0, NULL); +} +EXPORT_SYMBOL_GPL(ife_get_meta_u32); + +int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+V == 2+2+4 */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u32); + +int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u32 edata = metaval; + + if (mi->metaval) + edata = *(u32 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htonl(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u32); + +int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) +{ + if (mi->metaval) + return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval); + else + return nla_put(skb, mi->metaid, 0, NULL); +} +EXPORT_SYMBOL_GPL(ife_get_meta_u16); + +int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval) +{ + mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL); + if (!mi->metaval) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); + +int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval) +{ + mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL); + if (!mi->metaval) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ife_alloc_meta_u16); + +void ife_release_meta_gen(struct tcf_meta_info *mi) +{ + kfree(mi->metaval); +} +EXPORT_SYMBOL_GPL(ife_release_meta_gen); + +int ife_validate_meta_u32(void *val, int len) +{ + if (len == 4) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ife_validate_meta_u32); + +int ife_validate_meta_u16(void *val, int len) +{ + /* length will include padding */ + if (len == NLA_ALIGN(2)) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ife_validate_meta_u16); + +static LIST_HEAD(ifeoplist); +static DEFINE_RWLOCK(ife_mod_lock); + +static struct tcf_meta_ops *find_ife_oplist(u16 metaid) +{ + struct tcf_meta_ops *o; + + read_lock(&ife_mod_lock); + list_for_each_entry(o, &ifeoplist, list) { + if (o->metaid == metaid) { + if (!try_module_get(o->owner)) + o = NULL; + read_unlock(&ife_mod_lock); + return o; + } + } + read_unlock(&ife_mod_lock); + + return NULL; +} + +int register_ife_op(struct tcf_meta_ops *mops) +{ + struct tcf_meta_ops *m; + + if (!mops->metaid || !mops->metatype || !mops->name || + !mops->check_presence || !mops->encode || !mops->decode || + !mops->get || !mops->alloc) + return -EINVAL; + + write_lock(&ife_mod_lock); + + list_for_each_entry(m, &ifeoplist, list) { + if (m->metaid == mops->metaid || + (strcmp(mops->name, m->name) == 0)) { + write_unlock(&ife_mod_lock); + return -EEXIST; + } + } + + if (!mops->release) + mops->release = ife_release_meta_gen; + + list_add_tail(&mops->list, &ifeoplist); + write_unlock(&ife_mod_lock); + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ife_op); + +int unregister_ife_op(struct tcf_meta_ops *mops) +{ + struct tcf_meta_ops *m; + int err = -ENOENT; + + write_lock(&ife_mod_lock); + list_for_each_entry(m, &ifeoplist, list) { + if (m->metaid == mops->metaid) { + list_del(&mops->list); + err = 0; + break; + } + } + write_unlock(&ife_mod_lock); + + return err; +} +EXPORT_SYMBOL_GPL(register_ife_op); + +static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) +{ + int ret = 0; + /* XXX: unfortunately cant use nla_policy at this point + * because a length of 0 is valid in the case of + * "allow". "use" semantics do enforce for proper + * length and i couldve use nla_policy but it makes it hard + * to use it just for that.. + */ + if (ops->validate) + return ops->validate(val, len); + + if (ops->metatype == NLA_U32) + ret = ife_validate_meta_u32(val, len); + else if (ops->metatype == NLA_U16) + ret = ife_validate_meta_u16(val, len); + + return ret; +} + +/* called when adding new meta information + * under ife->tcf_lock +*/ +static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, + void *val, int len) +{ + struct tcf_meta_ops *ops = find_ife_oplist(metaid); + int ret = 0; + + if (!ops) { + ret = -ENOENT; +#ifdef CONFIG_MODULES + spin_unlock_bh(&ife->tcf_lock); + rtnl_unlock(); + request_module("ifemeta%u", metaid); + rtnl_lock(); + spin_lock_bh(&ife->tcf_lock); + ops = find_ife_oplist(metaid); +#endif + } + + if (ops) { + ret = 0; + if (len) + ret = ife_validate_metatype(ops, val, len); + + module_put(ops->owner); + } + + return ret; +} + +/* called when adding new meta information + * under ife->tcf_lock +*/ +static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, + int len) +{ + struct tcf_meta_info *mi = NULL; + struct tcf_meta_ops *ops = find_ife_oplist(metaid); + int ret = 0; + + if (!ops) + return -ENOENT; + + mi = kzalloc(sizeof(*mi), GFP_KERNEL); + if (!mi) { + /*put back what find_ife_oplist took */ + module_put(ops->owner); + return -ENOMEM; + } + + mi->metaid = metaid; + mi->ops = ops; + if (len > 0) { + ret = ops->alloc(mi, metaval); + if (ret != 0) { + kfree(mi); + module_put(ops->owner); + return ret; + } + } + + list_add_tail(&mi->metalist, &ife->metalist); + + return ret; +} + +static int use_all_metadata(struct tcf_ife_info *ife) +{ + struct tcf_meta_ops *o; + int rc = 0; + int installed = 0; + + list_for_each_entry(o, &ifeoplist, list) { + rc = add_metainfo(ife, o->metaid, NULL, 0); + if (rc == 0) + installed += 1; + } + + if (installed) + return 0; + else + return -EINVAL; +} + +static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) +{ + struct tcf_meta_info *e; + struct nlattr *nest; + unsigned char *b = skb_tail_pointer(skb); + int total_encoded = 0; + + /*can only happen on decode */ + if (list_empty(&ife->metalist)) + return 0; + + nest = nla_nest_start(skb, TCA_IFE_METALST); + if (!nest) + goto out_nlmsg_trim; + + list_for_each_entry(e, &ife->metalist, metalist) { + if (!e->ops->get(skb, e)) + total_encoded += 1; + } + + if (!total_encoded) + goto out_nlmsg_trim; + + nla_nest_end(skb, nest); + + return 0; + +out_nlmsg_trim: + nlmsg_trim(skb, b); + return -1; +} + +/* under ife->tcf_lock */ +static void _tcf_ife_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ife_info *ife = a->priv; + struct tcf_meta_info *e, *n; + + list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + module_put(e->ops->owner); + list_del(&e->metalist); + if (e->metaval) { + if (e->ops->release) + e->ops->release(e); + else + kfree(e->metaval); + } + kfree(e); + } +} + +static void tcf_ife_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ife_info *ife = a->priv; + + spin_lock_bh(&ife->tcf_lock); + _tcf_ife_cleanup(a, bind); + spin_unlock_bh(&ife->tcf_lock); +} + +/* under ife->tcf_lock */ +static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb) +{ + int len = 0; + int rc = 0; + int i = 0; + void *val; + + for (i = 1; i < max_metacnt; i++) { + if (tb[i]) { + val = nla_data(tb[i]); + len = nla_len(tb[i]); + + rc = load_metaops_and_vet(ife, i, val, len); + if (rc != 0) + return rc; + + rc = add_metainfo(ife, i, val, len); + if (rc) + return rc; + } + } + + return rc; +} + +static int tcf_ife_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + struct nlattr *tb[TCA_IFE_MAX + 1]; + struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_info *ife; + struct tc_ife *parm; + u16 ife_type = 0; + u8 *daddr = NULL; + u8 *saddr = NULL; + int ret = 0; + int err; + + err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); + if (err < 0) + return err; + + if (!tb[TCA_IFE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_IFE_PARMS]); + + if (parm->flags & IFE_ENCODE) { + /* Until we get issued the ethertype, we cant have + * a default.. + **/ + if (!tb[TCA_IFE_TYPE]) { + pr_info("You MUST pass etherype for encoding\n"); + return -EINVAL; + } + } + + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), + bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + if (bind) /* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + ife = to_ife(a); + ife->flags = parm->flags; + + if (parm->flags & IFE_ENCODE) { + ife_type = nla_get_u16(tb[TCA_IFE_TYPE]); + if (tb[TCA_IFE_DMAC]) + daddr = nla_data(tb[TCA_IFE_DMAC]); + if (tb[TCA_IFE_SMAC]) + saddr = nla_data(tb[TCA_IFE_SMAC]); + } + + spin_lock_bh(&ife->tcf_lock); + ife->tcf_action = parm->action; + + if (parm->flags & IFE_ENCODE) { + if (daddr) + ether_addr_copy(ife->eth_dst, daddr); + else + eth_zero_addr(ife->eth_dst); + + if (saddr) + ether_addr_copy(ife->eth_src, saddr); + else + eth_zero_addr(ife->eth_src); + + ife->eth_type = ife_type; + } + + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&ife->metalist); + + if (tb[TCA_IFE_METALST]) { + err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], + NULL); + if (err) { +metadata_parse_err: + if (ret == ACT_P_CREATED) + _tcf_ife_cleanup(a, bind); + + spin_unlock_bh(&ife->tcf_lock); + return err; + } + + err = populate_metalist(ife, tb2); + if (err) + goto metadata_parse_err; + + } else { + /* if no passed metadata allow list or passed allow-all + * then here we process by adding as many supported metadatum + * as we can. You better have at least one else we are + * going to bail out + */ + err = use_all_metadata(ife); + if (err) { + if (ret == ACT_P_CREATED) + _tcf_ife_cleanup(a, bind); + + spin_unlock_bh(&ife->tcf_lock); + return err; + } + } + + spin_unlock_bh(&ife->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, a); + + return ret; +} + +static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ife_info *ife = a->priv; + struct tc_ife opt = { + .index = ife->tcf_index, + .refcnt = ife->tcf_refcnt - ref, + .bindcnt = ife->tcf_bindcnt - bind, + .action = ife->tcf_action, + .flags = ife->flags, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); + if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t)) + goto nla_put_failure; + + if (!is_zero_ether_addr(ife->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + goto nla_put_failure; + } + + if (!is_zero_ether_addr(ife->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + goto nla_put_failure; + } + + if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + goto nla_put_failure; + + if (dump_metalist(skb, ife)) { + /*ignore failure to dump metalist */ + pr_info("Failed to dump metalist\n"); + } + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, + u16 metaid, u16 mlen, void *mdata) +{ + struct tcf_meta_info *e; + + /* XXX: use hash to speed up */ + list_for_each_entry(e, &ife->metalist, metalist) { + if (metaid == e->metaid) { + if (e->ops) { + /* We check for decode presence already */ + return e->ops->decode(skb, mdata, mlen); + } + } + } + + return 0; +} + +struct ifeheadr { + __be16 metalen; + u8 tlv_data[]; +}; + +struct meta_tlvhdr { + __be16 type; + __be16 len; +}; + +static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + int action = ife->tcf_action; + struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; + u16 ifehdrln = ifehdr->metalen; + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); + + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + spin_unlock(&ife->tcf_lock); + + ifehdrln = ntohs(ifehdrln); + if (unlikely(!pskb_may_pull(skb, ifehdrln))) { + spin_lock(&ife->tcf_lock); + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + skb_set_mac_header(skb, ifehdrln); + __skb_pull(skb, ifehdrln); + skb->protocol = eth_type_trans(skb, skb->dev); + ifehdrln -= IFE_METAHDRLEN; + + while (ifehdrln > 0) { + u8 *tlvdata = (u8 *)tlv; + u16 mtype = tlv->type; + u16 mlen = tlv->len; + + mtype = ntohs(mtype); + mlen = ntohs(mlen); + + if (find_decode_metaid(skb, ife, mtype, (mlen - 4), + (void *)(tlvdata + 4))) { + /* abuse overlimits to count when we receive metadata + * but dont have an ops for it + */ + pr_info_ratelimited("Unknown metaid %d alnlen %d\n", + mtype, mlen); + ife->tcf_qstats.overlimits++; + } + + tlvdata += mlen; + ifehdrln -= mlen; + tlv = (struct meta_tlvhdr *)tlvdata; + } + + skb_reset_network_header(skb); + return action; +} + +/*XXX: check if we can do this at install time instead of current + * send data path +**/ +static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) +{ + struct tcf_meta_info *e, *n; + int tot_run_sz = 0, run_sz = 0; + + list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + if (e->ops->check_presence) { + run_sz = e->ops->check_presence(skb, e); + tot_run_sz += run_sz; + } + } + + return tot_run_sz; +} + +static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + int action = ife->tcf_action; + struct ethhdr *oethh; /* outer ether header */ + struct ethhdr *iethh; /* inner eth header */ + struct tcf_meta_info *e; + /* + OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA + where ORIGDATA = original ethernet header ... + */ + u16 metalen = ife_get_sz(skb, ife); + int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; + unsigned int skboff = skb->dev->hard_header_len; + u32 at = G_TC_AT(skb->tc_verd); + int new_len = skb->len + hdrm; + bool exceed_mtu = false; + int err; + + if (at & AT_EGRESS) { + if (new_len > skb->dev->mtu) + exceed_mtu = true; + } + + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + + if (!metalen) { /* no metadata to send */ + /* abuse overlimits to count when we allow packet + * with no metadata + */ + ife->tcf_qstats.overlimits++; + spin_unlock(&ife->tcf_lock); + return action; + } + /* could be stupid policy setup or mtu config + * so lets be conservative.. */ + if ((action == TC_ACT_SHOT) || exceed_mtu) { + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + iethh = eth_hdr(skb); + + err = skb_cow_head(skb, hdrm); + if (unlikely(err)) { + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + if (!(at & AT_EGRESS)) + skb_push(skb, skb->dev->hard_header_len); + + __skb_push(skb, hdrm); + memcpy(skb->data, iethh, skb->mac_len); + skb_reset_mac_header(skb); + oethh = eth_hdr(skb); + + /*total metadata length */ + metalen += IFE_METAHDRLEN; + metalen = htons(metalen); + memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN); + skboff += IFE_METAHDRLEN; + + /* XXX: we dont have a clever way of telling encode to + * not repeat some of the computations that are done by + * ops->presence_check... + */ + list_for_each_entry(e, &ife->metalist, metalist) { + if (e->ops->encode) { + err = e->ops->encode(skb, (void *)(skb->data + skboff), + e); + } + if (err < 0) { + /* too corrupt to keep around if overwritten */ + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + skboff += err; + } + + if (!is_zero_ether_addr(ife->eth_src)) + ether_addr_copy(oethh->h_source, ife->eth_src); + else + ether_addr_copy(oethh->h_source, iethh->h_source); + if (!is_zero_ether_addr(ife->eth_dst)) + ether_addr_copy(oethh->h_dest, ife->eth_dst); + else + ether_addr_copy(oethh->h_dest, iethh->h_dest); + oethh->h_proto = htons(ife->eth_type); + + if (!(at & AT_EGRESS)) + skb_pull(skb, skb->dev->hard_header_len); + + spin_unlock(&ife->tcf_lock); + + return action; +} + +static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + + if (ife->flags & IFE_ENCODE) + return tcf_ife_encode(skb, a, res); + + if (!(ife->flags & IFE_ENCODE)) + return tcf_ife_decode(skb, a, res); + + pr_info_ratelimited("unknown failure(policy neither de/encode\n"); + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + + return TC_ACT_SHOT; +} + +static int tcf_ife_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_ife_ops = { + .kind = "ife", + .type = TCA_ACT_IFE, + .owner = THIS_MODULE, + .act = tcf_ife_act, + .dump = tcf_ife_dump, + .cleanup = tcf_ife_cleanup, + .init = tcf_ife_init, + .walk = tcf_ife_walker, + .lookup = tcf_ife_search, +}; + +static __net_init int ife_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK); +} + +static void __net_exit ife_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations ife_net_ops = { + .init = ife_init_net, + .exit = ife_exit_net, + .id = &ife_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ife_init_module(void) +{ + return tcf_register_action(&act_ife_ops, &ife_net_ops); +} + +static void __exit ife_cleanup_module(void) +{ + tcf_unregister_action(&act_ife_ops, &ife_net_ops); +} + +module_init(ife_init_module); +module_exit(ife_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE LFB action"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d05869646515..350e134cffb3 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,6 +30,10 @@ #define IPT_TAB_MASK 15 +static int ipt_net_id; + +static int xt_net_id; + static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; @@ -62,6 +66,7 @@ static void ipt_destroy_target(struct xt_entry_target *t) struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, + .family = NFPROTO_IPV4, }; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -83,8 +88,9 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; @@ -113,8 +119,9 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); + if (!tcf_hash_check(tn, index, a, bind)) { + ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + false); if (ret) return ret; ret = ACT_P_CREATED; @@ -157,7 +164,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; err3: @@ -170,6 +177,24 @@ err1: return err; } +static int tcf_ipt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + +static int tcf_xt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -195,6 +220,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, par.hooknum = ipt->tcfi_hook; par.target = ipt->tcfi_t->u.kernel.target; par.targinfo = ipt->tcfi_t->data; + par.family = NFPROTO_IPV4; ret = par.target->target(skb, &par); switch (ret) { @@ -260,6 +286,22 @@ nla_put_failure: return -1; } +static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -268,8 +310,47 @@ static struct tc_action_ops act_ipt_ops = { .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, + .walk = tcf_ipt_walker, + .lookup = tcf_ipt_search, +}; + +static __net_init int ipt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK); +} + +static void __net_exit ipt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations ipt_net_ops = { + .init = ipt_init_net, + .exit = ipt_exit_net, + .id = &ipt_net_id, + .size = sizeof(struct tc_action_net), }; +static int tcf_xt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -277,7 +358,30 @@ static struct tc_action_ops act_xt_ops = { .act = tcf_ipt, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, - .init = tcf_ipt_init, + .init = tcf_xt_init, + .walk = tcf_xt_walker, + .lookup = tcf_xt_search, +}; + +static __net_init int xt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK); +} + +static void __net_exit xt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations xt_net_ops = { + .init = xt_init_net, + .exit = xt_exit_net, + .id = &xt_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); @@ -289,12 +393,13 @@ static int __init ipt_init_module(void) { int ret1, ret2; - ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); if (ret1 < 0) - printk("Failed to load xt action\n"); - ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + pr_err("Failed to load xt action\n"); + + ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); if (ret2 < 0) - printk("Failed to load ipt action\n"); + pr_err("Failed to load ipt action\n"); if (ret1 < 0 && ret2 < 0) { return ret1; @@ -304,8 +409,8 @@ static int __init ipt_init_module(void) static void __exit ipt_cleanup_module(void) { - tcf_unregister_action(&act_xt_ops); - tcf_unregister_action(&act_ipt_ops); + tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); + tcf_unregister_action(&act_xt_ops, &xt_net_ops); } module_init(ipt_init_module); diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c new file mode 100644 index 000000000000..82892170ce4f --- /dev/null +++ b/net/sched/act_meta_mark.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_mark.c IFE skb->mark metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> +#include <linux/rtnetlink.h> + +static int skbmark_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifemark = skb->mark; + + return ife_encode_meta_u32(ifemark, skbdata, e); +} + +static int skbmark_decode(struct sk_buff *skb, void *data, u16 len) +{ + u32 ifemark = *(u32 *)data; + + skb->mark = ntohl(ifemark); + return 0; +} + +static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u32(skb->mark, e); +} + +static struct tcf_meta_ops ife_skbmark_ops = { + .metaid = IFE_META_SKBMARK, + .metatype = NLA_U32, + .name = "skbmark", + .synopsis = "skb mark 32 bit metadata", + .check_presence = skbmark_check, + .encode = skbmark_encode, + .decode = skbmark_decode, + .get = ife_get_meta_u32, + .alloc = ife_alloc_meta_u32, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u32, + .owner = THIS_MODULE, +}; + +static int __init ifemark_init_module(void) +{ + return register_ife_op(&ife_skbmark_ops); +} + +static void __exit ifemark_cleanup_module(void) +{ + unregister_ife_op(&ife_skbmark_ops); +} + +module_init(ifemark_init_module); +module_exit(ifemark_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c new file mode 100644 index 000000000000..26bf4d86030b --- /dev/null +++ b/net/sched/act_meta_skbprio.c @@ -0,0 +1,76 @@ +/* + * net/sched/act_meta_prio.c IFE skb->priority metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> + +static int skbprio_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u32(skb->priority, e); +} + +static int skbprio_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifeprio = skb->priority; /* avoid having to cast skb->priority*/ + + return ife_encode_meta_u32(ifeprio, skbdata, e); +} + +static int skbprio_decode(struct sk_buff *skb, void *data, u16 len) +{ + u32 ifeprio = *(u32 *)data; + + skb->priority = ntohl(ifeprio); + return 0; +} + +static struct tcf_meta_ops ife_prio_ops = { + .metaid = IFE_META_PRIO, + .metatype = NLA_U32, + .name = "skbprio", + .synopsis = "skb prio metadata", + .check_presence = skbprio_check, + .encode = skbprio_encode, + .decode = skbprio_decode, + .get = ife_get_meta_u32, + .alloc = ife_alloc_meta_u32, + .owner = THIS_MODULE, +}; + +static int __init ifeprio_init_module(void) +{ + return register_ife_op(&ife_prio_ops); +} + +static void __exit ifeprio_cleanup_module(void) +{ + unregister_ife_op(&ife_prio_ops); +} + +module_init(ifeprio_init_module); +module_exit(ifeprio_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_PRIO); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 32fcdecdb9e2..e8a760cf7775 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -50,10 +50,13 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; +static int mirred_net_id; + static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; @@ -96,11 +99,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev = NULL; } - if (!tcf_hash_check(parm->index, a, bind)) { + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), - bind, true); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*m), bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -130,7 +133,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(a); + tcf_hash_insert(tn, a); } return ret; @@ -179,7 +182,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - skb_sender_cpu_clear(skb2); err = dev_queue_xmit(skb2); if (err) { @@ -221,6 +223,22 @@ nla_put_failure: return -1; } +static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_hash_search(tn, a, index); +} + static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -257,6 +275,29 @@ static struct tc_action_ops act_mirred_ops = { .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, + .walk = tcf_mirred_walker, + .lookup = tcf_mirred_search, +}; + +static __net_init int mirred_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK); +} + +static void __net_exit mirred_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations mirred_net_ops = { + .init = mirred_init_net, + .exit = mirred_exit_net, + .id = &mirred_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -270,12 +311,12 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); + return tcf_register_action(&act_mirred_ops, &mirred_net_ops); } static void __exit mirred_cleanup_module(void) { - tcf_unregister_action(&act_mirred_ops); + tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b7c4ead8b5a8..0f65cdfbfb1d 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -31,6 +31,8 @@ #define NAT_TAB_MASK 15 +static int nat_net_id; + static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; @@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; int ret = 0, err; @@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -126,9 +129,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, addr = iph->daddr; if (!((old_addr ^ addr) & mask)) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + noff)) goto drop; new_addr &= mask; @@ -156,9 +157,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcphdr *tcph; if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff)) goto drop; tcph = (void *)(skb_network_header(skb) + ihl); @@ -171,9 +170,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct udphdr *udph; if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*udph) + noff)) goto drop; udph = (void *)(skb_network_header(skb) + ihl); @@ -213,10 +210,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, if ((old_addr ^ addr) & mask) break; - if (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*icmph) + - sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); @@ -282,6 +277,22 @@ nla_put_failure: return -1; } +static int tcf_nat_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -289,6 +300,29 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat, .dump = tcf_nat_dump, .init = tcf_nat_init, + .walk = tcf_nat_walker, + .lookup = tcf_nat_search, +}; + +static __net_init int nat_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK); +} + +static void __net_exit nat_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations nat_net_ops = { + .init = nat_init_net, + .exit = nat_exit_net, + .id = &nat_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Stateless NAT actions"); @@ -296,12 +330,12 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { - return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); + return tcf_register_action(&act_nat_ops, &nat_net_ops); } static void __exit nat_cleanup_module(void) { - tcf_unregister_action(&act_nat_ops); + tcf_unregister_action(&act_nat_ops, &nat_net_ops); } module_init(nat_init_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e38a7701f154..429c3ab65142 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -25,6 +25,8 @@ #define PEDIT_TAB_MASK 15 +static int pedit_net_id; + static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; @@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; int ret = 0, err; @@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - if (!tcf_hash_check(parm->index, a, bind)) { + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; p = to_pedit(a); @@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -211,6 +214,22 @@ nla_put_failure: return -1; } +static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = { .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, + .walk = tcf_pedit_walker, + .lookup = tcf_pedit_search, +}; + +static __net_init int pedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK); +} + +static void __net_exit pedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations pedit_net_ops = { + .init = pedit_init_net, + .exit = pedit_exit_net, + .id = &pedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -227,12 +269,12 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); + return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { - tcf_unregister_action(&act_pedit_ops); + tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 9a1c42a43f92..330f14e302e8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -55,10 +55,14 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ -static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +static int police_net_id; + +static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; int size; if (nla == NULL) @@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - if (tcf_hash_search(a, parm->index)) { + if (tcf_hash_search(tn, a, parm->index)) { police = to_police(a->priv); if (bind) { police->tcf_bindcnt += 1; @@ -233,7 +238,7 @@ override: police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(hinfo); + tcf_hash_new_index(tn); h = tcf_hash(police->tcf_index, POL_TAB_MASK); spin_lock_bh(&hinfo->lock); hlist_add_head(&police->tcf_head, &hinfo->htab[h]); @@ -342,6 +347,13 @@ nla_put_failure: return -1; } +static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_hash_search(tn, a, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = { .act = tcf_act_police, .dump = tcf_act_police_dump, .init = tcf_act_police_locate, - .walk = tcf_act_police_walker + .walk = tcf_act_police_walker, + .lookup = tcf_police_search, +}; + +static __net_init int police_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK); +} + +static void __net_exit police_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations police_net_ops = { + .init = police_init_net, + .exit = police_exit_net, + .id = &police_net_id, + .size = sizeof(struct tc_action_net), }; static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops, POL_TAB_MASK); + return tcf_register_action(&act_police_ops, &police_net_ops); } static void __exit police_cleanup_module(void) { - tcf_unregister_action(&act_police_ops); + tcf_unregister_action(&act_police_ops, &police_net_ops); } module_init(police_init_module); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index d6b708d6afdf..75b2be13fbcc 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -26,6 +26,8 @@ #define SIMP_TAB_MASK 7 +static int simp_net_id; + #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -80,6 +82,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; @@ -102,9 +105,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_DEF_PARMS]); defdata = nla_data(tb[TCA_DEF_DATA]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -129,7 +132,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -161,6 +164,22 @@ nla_put_failure: return -1; } +static int tcf_simp_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -169,6 +188,29 @@ static struct tc_action_ops act_simp_ops = { .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, + .walk = tcf_simp_walker, + .lookup = tcf_simp_search, +}; + +static __net_init int simp_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK); +} + +static void __net_exit simp_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations simp_net_ops = { + .init = simp_init_net, + .exit = simp_exit_net, + .id = &simp_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -177,8 +219,7 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret; - ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); + int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; @@ -186,7 +227,7 @@ static int __init simp_init_module(void) static void __exit simp_cleanup_module(void) { - tcf_unregister_action(&act_simp_ops); + tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6751b5f8c046..cfcdbdc00c9b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -29,6 +29,8 @@ #define SKBEDIT_TAB_MASK 15 +static int skbedit_net_id; + static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -61,6 +63,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -98,9 +101,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -130,7 +133,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -173,6 +176,22 @@ nla_put_failure: return -1; } +static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -180,6 +199,29 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .walk = tcf_skbedit_walker, + .lookup = tcf_skbedit_search, +}; + +static __net_init int skbedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK); +} + +static void __net_exit skbedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbedit_net_ops = { + .init = skbedit_init_net, + .exit = skbedit_exit_net, + .id = &skbedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -188,12 +230,12 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { - return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); + return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { - tcf_unregister_action(&act_skbedit_ops); + tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 796785e0bf96..bab8ae0cefc0 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -21,6 +21,8 @@ #define VLAN_TAB_MASK 15 +static int vlan_net_id; + static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -68,6 +70,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tc_vlan *parm; struct tcf_vlan *v; @@ -115,9 +118,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } action = parm->v_action; - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*v), bind, false); if (ret) return ret; @@ -143,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -181,6 +184,22 @@ nla_put_failure: return -1; } +static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -188,16 +207,39 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .walk = tcf_vlan_walker, + .lookup = tcf_vlan_search, +}; + +static __net_init int vlan_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK); +} + +static void __net_exit vlan_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations vlan_net_ops = { + .init = vlan_init_net, + .exit = vlan_exit_net, + .id = &vlan_net_id, + .size = sizeof(struct tc_action_net), }; static int __init vlan_init_module(void) { - return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK); + return tcf_register_action(&act_vlan_ops, &vlan_net_ops); } static void __exit vlan_cleanup_module(void) { - tcf_unregister_action(&act_vlan_ops); + tcf_unregister_action(&act_vlan_ops, &vlan_net_ops); } module_init(vlan_init_module); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 95b021243233..2181ffc76638 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head) kfree(f); } +static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, 0)) + return; + + offload.command = TC_CLSFLOWER_DESTROY; + offload.cookie = cookie; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + +static void fl_hw_replace_filter(struct tcf_proto *tp, + struct flow_dissector *dissector, + struct fl_flow_key *mask, + struct fl_flow_key *key, + struct tcf_exts *actions, + unsigned long cookie, u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, flags)) + return; + + offload.command = TC_CLSFLOWER_REPLACE; + offload.cookie = cookie; + offload.dissector = dissector; + offload.mask = mask; + offload.key = key; + offload.exts = actions; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(f, next, &head->filters, list) { + fl_hw_destroy_filter(tp, (unsigned long)f); list_del_rcu(&f->list); call_rcu(&f->rcu, fl_destroy_filter); } @@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr *tb[TCA_FLOWER_MAX + 1]; struct fl_flow_mask mask = {}; + u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } fnew->handle = handle; + if (tb[TCA_FLOWER_FLAGS]) + flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) goto errout; @@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, head->ht_params); if (err) goto errout; - if (fold) + + fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + &fnew->key, + &fnew->exts, + (unsigned long)fnew, + flags); + + if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); + fl_hw_destroy_filter(tp, (unsigned long)fold); + } *arg = (unsigned long) fnew; @@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); list_del_rcu(&f->list); + fl_hw_destroy_filter(tp, (unsigned long)f); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fl_destroy_filter); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4fbb67430ce4..563cdad76448 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -43,6 +43,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <linux/netdevice.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -58,6 +59,7 @@ struct tc_u_knode { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif + u32 flags; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -424,6 +426,97 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; + offload.cls_u32->knode.handle = handle; + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_hnode(struct tcf_proto *tp, + struct tc_u_hnode *h, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_NEW_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_knode(struct tcf_proto *tp, + struct tc_u_knode *n, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; + offload.cls_u32->knode.handle = n->handle; + offload.cls_u32->knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + offload.cls_u32->knode.val = n->val; + offload.cls_u32->knode.mask = n->mask; +#else + offload.cls_u32->knode.val = 0; + offload.cls_u32->knode.mask = 0; +#endif + offload.cls_u32->knode.sel = &n->sel; + offload.cls_u32->knode.exts = &n->exts; + if (n->ht_down) + offload.cls_u32->knode.link_handle = n->ht_down->handle; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; @@ -434,6 +527,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); + u32_remove_hw_knode(tp, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -454,6 +548,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { + u32_clear_hw_hnode(tp, ht); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -540,8 +635,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (ht == NULL) return 0; - if (TC_U32_KEY(ht->handle)) + if (TC_U32_KEY(ht->handle)) { + u32_remove_hw_knode(tp, ht->handle); return u32_delete_key(tp, (struct tc_u_knode *)ht); + } if (root_ht == ht) return -EINVAL; @@ -587,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, + [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; static int u32_set_parms(struct net *net, struct tcf_proto *tp, @@ -694,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, #endif new->fshift = n->fshift; new->res = n->res; + new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, n->ht_down); /* bump reference count as long as we hold pointer to structure */ @@ -733,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid; + u32 htid, flags = 0; int err; #ifdef CONFIG_CLS_U32_PERF size_t size; @@ -746,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; + if (tb[TCA_U32_FLAGS]) + flags = nla_get_u32(tb[TCA_U32_FLAGS]); + n = (struct tc_u_knode *)*arg; if (n) { struct tc_u_knode *new; @@ -753,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; + if (n->flags != flags) + return -EINVAL; + new = u32_init_knode(tp, n); if (!new) return -ENOMEM; @@ -769,6 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); + u32_replace_hw_knode(tp, new, flags); return 0; } @@ -795,6 +901,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; + + u32_replace_hw_hnode(tp, ht, flags); return 0; } @@ -845,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + n->flags = flags; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; @@ -877,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - + u32_replace_hw_knode(tp, n, flags); *arg = (unsigned long)n; return 0; } @@ -982,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; + if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags)) + goto nla_put_failure; + #ifdef CONFIG_CLS_U32_MARK if ((n->val || n->mask)) { struct tc_u32_mark mark = {.val = n->val, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b5c2cf2aa6d4..3b180ff72f79 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -744,14 +744,15 @@ static u32 qdisc_alloc_handle(struct net_device *dev) return 0; } -void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, + unsigned int len) { const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; int drops; - if (n == 0) + if (n == 0 && len == 0) return; drops = max_t(int, n, 0); rcu_read_lock(); @@ -774,11 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; + sch->qstats.backlog -= len; __qdisc_qstats_drop(sch, drops); } rcu_read_unlock(); } -EXPORT_SYMBOL(qdisc_tree_decrease_qlen); +EXPORT_SYMBOL(qdisc_tree_reduce_backlog); static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, @@ -1841,7 +1843,7 @@ reclassify: return err; } - return -1; + return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= MAX_REC_LOOP)) { @@ -1852,6 +1854,7 @@ reset: } tp = old_tp; + protocol = tc_skb_protocol(skb); goto reclassify; #endif } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c538d9e4a8f6..baafddf229ce 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new->reshape_fail = cbq_reshape_fail; #endif } - sch_tree_lock(sch); - *old = cl->q; - cl->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->q); return 0; } @@ -1914,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - unsigned int qlen; + unsigned int qlen, backlog; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; @@ -1922,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); qlen = cl->q->q.qlen; + backlog = cl->q->qstats.backlog; qdisc_reset(cl->q); - qdisc_tree_decrease_qlen(cl->q, qlen); + qdisc_tree_reduce_backlog(cl->q, qlen, backlog); if (cl->next_alive) cbq_deactivate_class(cl); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 5ffb8b8337c7..0a08c860eee4 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) choke_zap_tail_holes(q); qdisc_qstats_backlog_dec(sch, skb); + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch); - qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; } @@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) old = q->tab; if (old) { unsigned int oqlen = sch->q.qlen, tail = 0; + unsigned dropped = 0; while (q->head != q->tail) { struct sk_buff *skb = q->tab[q->head]; @@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; q->tail = tail; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 535007d5f0b5..9b7e2980ee5c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->stats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; + q->stats.drop_len = 0; } if (skb) qdisc_bstats_update(sch, skb); @@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) { struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index a1cd778240cd..a63e879e8975 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) static void drr_purge_queue(struct drr_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { @@ -226,11 +227,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - drr_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index f357f34d02d2..34b4ddaca27c 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - *old = p->q; - p->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &p->q); return 0; } @@ -264,6 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -281,11 +276,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - skb = p->q->ops->dequeue(p->q); + skb = qdisc_dequeue_peeked(p->q); if (skb == NULL) return NULL; qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); @@ -401,6 +397,7 @@ static void dsmark_reset(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 109b2322778f..3c6a47d66a04 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; int err, drop_count = 0; + unsigned drop_len = 0; u32 fq_log; if (!opt) @@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); drop_count++; } - qdisc_tree_decrease_qlen(sch, drop_count); + qdisc_tree_reduce_backlog(sch, drop_count, drop_len); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4c834e93dafb..d3fc8f9dd3d4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - unsigned int idx; + unsigned int idx, prev_backlog; struct fq_codel_flow *flow; int uninitialized_var(ret); @@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet * from this flow. @@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) struct fq_codel_flow *flow; struct list_head *head; u32 prev_drop_count, prev_ecn_mark; + unsigned int prev_backlog; begin: head = &q->new_flows; @@ -259,6 +261,7 @@ begin: prev_drop_count = q->cstats.drop_count; prev_ecn_mark = q->cstats.ecn_mark; + prev_backlog = sch->qstats.backlog; skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, dequeue); @@ -276,12 +279,14 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->cstats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, + q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; } return skb; } @@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_codel_dequeue(sch); + q->cstats.drop_len += qdisc_pkt_len(skb); kfree_skb(skb); q->cstats.drop_count++; } - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 16bc83b2842a..f18c35024207 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -567,6 +567,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b7ebe2c87586..d783d7cc3348 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -895,9 +895,10 @@ static void hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static void @@ -1215,11 +1216,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; } - sch_tree_lock(sch); - hfsc_purge_queue(sch, cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 86b04e31e60b..13d6f83ec491 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; + unsigned int prev_backlog; idx = hhf_classify(skb, sch); @@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. @@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen; + unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) } qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); kfree_skb(skb); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, + prev_backlog - sch->qstats.backlog); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 15ccd7f8fb2a..87b02ed3d5f2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -600,6 +600,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) htb_activate(q, cl); } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -889,6 +890,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) ok: qdisc_bstats_update(sch, skb); qdisc_unthrottled(sch); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -955,6 +957,7 @@ static unsigned int htb_drop(struct Qdisc *sch) unsigned int len; if (cl->un.leaf.q->ops->drop && (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->qstats.backlog -= len; sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) htb_deactivate(q, cl); @@ -984,12 +987,12 @@ static void htb_reset(struct Qdisc *sch) } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; - } } qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); for (i = 0; i < TC_HTB_NUMPRIO; i++) @@ -1163,14 +1166,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, cl->common.classid)) == NULL) return -ENOBUFS; - sch_tree_lock(sch); - *old = cl->un.leaf.q; - cl->un.leaf.q = new; - if (*old != NULL) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->un.leaf.q); return 0; } @@ -1272,7 +1268,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - unsigned int qlen; struct Qdisc *new_q = NULL; int last_child = 0; @@ -1292,9 +1287,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); if (!cl->level) { - qlen = cl->un.leaf.q->q.qlen; + unsigned int qlen = cl->un.leaf.q->q.qlen; + unsigned int backlog = cl->un.leaf.q->qstats.backlog; + qdisc_reset(cl->un.leaf.q); - qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); } /* delete from hash and active; remainder in destroy_class */ @@ -1428,10 +1425,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); if (parent && !parent->level) { unsigned int qlen = parent->un.leaf.q->q.qlen; + unsigned int backlog = parent->un.leaf.q->qstats.backlog; /* turn parent into inner node */ qdisc_reset(parent->un.leaf.q); - qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); qdisc_destroy(parent->un.leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 3e82f047caaf..56a77b878eb3 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index ad70ecf57ce7..b8002ce3d010 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO}; unsigned int ntx; if (priv->qdiscs) { @@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, 0); + dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); else netdev_set_num_tc(dev, 0); } @@ -124,7 +125,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + qdisc = qdisc_create_dflt(dev_queue, + get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { @@ -140,8 +142,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, + { .tc = qopt->num_tc }}; + priv->hw_owned = 1; - err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) goto err; } else { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 4e904ca0af9d..bcdd54bb101c 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, + child->qstats.backlog); qdisc_destroy(child); } } @@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -303,13 +305,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5abd1d9de989..9640bb39a5d2 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -598,7 +598,8 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, + qdisc_pkt_len(skb)); } } goto tfifo_dequeue; @@ -1037,15 +1038,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - if (*old) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b783a446d884..71ae3b9629f9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index ba6487f2741f..fee1b15506b2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; if (child != &noop_qdisc) { - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); qdisc_destroy(child); } } @@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -268,13 +269,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3dc3a6e56052..8d2d8d953432 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) static void qfq_purge_queue(struct qfq_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { @@ -617,11 +618,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - qfq_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6c0534cc7758..8c0508c0e287 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) q->flags = ctl->flags; q->limit = ctl->limit; if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -313,12 +314,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5bbb6332ec57..c69611640fa5 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; @@ -606,12 +607,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 3abab534eb5c..498f0a2cb47f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -346,7 +346,7 @@ static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned int hash; + unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int uninitialized_var(ret); @@ -461,7 +461,7 @@ enqueue: return NET_XMIT_SUCCESS; qlen = slot->qlen; - sfq_drop(sch); + dropped = sfq_drop(sch); /* Return Congestion Notification only if we dropped a packet * from this flow. */ @@ -469,7 +469,7 @@ enqueue: return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } @@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch) struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; + unsigned int drop_len = 0; __skb_queue_head_init(&list); @@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch) if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; @@ -594,7 +596,7 @@ drop: } } sch->q.qlen -= dropped; - qdisc_tree_decrease_qlen(sch, dropped); + qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(unsigned long arg) @@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; - unsigned int qlen; + unsigned int qlen, dropped = 0; struct red_parms *p = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) @@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > q->limit) - sfq_drop(sch); - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + dropped += sfq_drop(sch); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); del_timer(&q->perturb_timer); if (q->perturb_period) { diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a4afde14e865..c2fbde742f37 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); + unsigned int len = 0, prev_len = qdisc_pkt_len(skb); int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); @@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) nskb = segs->next; segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; + len += segs->len; ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen += nb; if (nb > 1) - qdisc_tree_decrease_qlen(sch, 1 - nb); + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); consume_skb(skb); return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } @@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -502,13 +505,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 2bf8ec92dde4..a19b3e607703 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1263,7 +1263,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, if (score_curr > score_best) return curr; else if (score_curr == score_best) - return sctp_trans_elect_tie(curr, best); + return sctp_trans_elect_tie(best, curr); else return best; } @@ -1493,7 +1493,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) asoc->peer.sack_needed = 0; - sctp_outq_tail(&asoc->outqueue, sack); + sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC); /* Stop the SACK timer. */ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a3380917f197..958ef5f33f4b 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } -void sctp_datamsg_free(struct sctp_datamsg *msg) -{ - struct sctp_chunk *chunk; - - /* This doesn't have to be a _safe vairant because - * sctp_chunk_free() only drops the refs. - */ - list_for_each_entry(chunk, &msg->chunks, frag_list) - sctp_chunk_free(chunk); - - sctp_datamsg_put(msg); -} - /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { @@ -273,7 +260,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, frag |= SCTP_DATA_SACK_IMM; } - chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0); + chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, + 0, GFP_KERNEL); if (!chunk) { err = -ENOMEM; @@ -309,7 +297,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; - chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0); + chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, + 0, GFP_KERNEL); if (!chunk) { err = -ENOMEM; diff --git a/net/sctp/input.c b/net/sctp/input.c index 49d2cc751386..db76f1ab4ac2 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -221,7 +221,7 @@ int sctp_rcv(struct sk_buff *skb) goto discard_release; /* Create an SCTP packet structure. */ - chunk = sctp_chunkify(skb, asoc, sk); + chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; @@ -937,7 +937,6 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_transport *t; struct sctp_association *asoc = NULL; - rcu_read_lock(); t = sctp_addrs_lookup_transport(net, local, peer); if (!t || !sctp_transport_hold(t)) goto out; @@ -949,7 +948,6 @@ static struct sctp_association *__sctp_lookup_association( sctp_transport_put(t); out: - rcu_read_unlock(); return asoc; } @@ -962,7 +960,9 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; + rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + rcu_read_unlock(); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ec529121f38a..ce46f1c7f133 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, } return 0; } + if (addr1->v6.sin6_port != addr2->v6.sin6_port) + return 0; if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 9d610eddd19e..736c004abfbc 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -153,7 +153,7 @@ void sctp_packet_free(struct sctp_packet *packet) */ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, - int one_packet) + int one_packet, gfp_t gfp) { sctp_xmit_t retval; int error = 0; @@ -163,7 +163,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { - error = sctp_packet_transmit(packet); + error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; @@ -376,7 +376,7 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) * * The return value is a normal kernel error return value. */ -int sctp_packet_transmit(struct sctp_packet *packet) +int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c0380cfb16ae..f03541d0f12d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout); +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,7 +285,7 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) +int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); int error = 0; @@ -341,7 +341,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) return error; if (!q->cork) - error = sctp_outq_flush(q, 0); + error = sctp_outq_flush(q, 0, gfp); return error; } @@ -510,7 +510,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) - error = sctp_outq_flush(q, /* rtx_timeout */ 1); + error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); if (error) q->asoc->base.sk->sk_err = -error; @@ -601,12 +601,12 @@ redo: * control chunks are already freed so there * is nothing we can do. */ - sctp_packet_transmit(pkt); + sctp_packet_transmit(pkt, GFP_ATOMIC); goto redo; } /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* If we are retransmitting, we should only * send a single packet. @@ -622,7 +622,7 @@ redo: case SCTP_XMIT_RWND_FULL: /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* Stop sending DATA as there is no more room * at the receiver. @@ -632,7 +632,7 @@ redo: case SCTP_XMIT_DELAY: /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* Stop sending DATA because of nagle delay. */ done = 1; @@ -685,12 +685,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q) +int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0); + return sctp_outq_flush(q, 0, gfp); } @@ -703,7 +703,7 @@ int sctp_outq_uncork(struct sctp_outq *q) * locking concerns must be made. Today we use the sock lock to protect * this function. */ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -825,7 +825,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); sctp_packet_append_chunk(&singleton, chunk); - error = sctp_packet_transmit(&singleton); + error = sctp_packet_transmit(&singleton, gfp); if (error < 0) return error; break; @@ -856,7 +856,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: status = sctp_packet_transmit_chunk(packet, chunk, - one_packet); + one_packet, gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &q->control_chunk_list); @@ -1011,7 +1011,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) atomic_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ - status = sctp_packet_transmit_chunk(packet, chunk, 0); + status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp); switch (status) { case SCTP_XMIT_PMTU_FULL: @@ -1088,7 +1088,7 @@ sctp_flush_out: send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) - error = sctp_packet_transmit(packet); + error = sctp_packet_transmit(packet, gfp); /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 5e68b94ee640..6cc2152e0740 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -65,7 +65,7 @@ static struct { struct kfifo fifo; spinlock_t lock; wait_queue_head_t wait; - struct timespec tstart; + struct timespec64 tstart; } sctpw; static __printf(1, 2) void printl(const char *fmt, ...) @@ -85,7 +85,7 @@ static __printf(1, 2) void printl(const char *fmt, ...) static int sctpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(&sctpw.fifo); - getnstimeofday(&sctpw.tstart); + ktime_get_ts64(&sctpw.tstart); return 0; } @@ -138,7 +138,7 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, struct sk_buff *skb = chunk->skb; struct sctp_transport *sp; static __u32 lcwnd = 0; - struct timespec now; + struct timespec64 now; sp = asoc->peer.primary_path; @@ -149,8 +149,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, (full || sp->cwnd != lcwnd)) { lcwnd = sp->cwnd; - getnstimeofday(&now); - now = timespec_sub(now, sctpw.tstart); + ktime_get_ts64(&now); + now = timespec64_sub(now, sctpw.tstart); printl("%lu.%06lu ", (unsigned long) now.tv_sec, (unsigned long) now.tv_nsec / NSEC_PER_USEC); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ded7d931a6a5..5cfac8d5d3b3 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -161,7 +161,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa struct sctp_af *af; primary = &assoc->peer.primary_addr; - rcu_read_lock(); list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; @@ -172,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa } af->seq_dump_addr(seq, addr); } - rcu_read_unlock(); } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) @@ -482,7 +480,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; - struct sctp_transport *tsp; + struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " @@ -490,10 +488,10 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) return 0; } - tsp = (struct sctp_transport *)v; - if (!sctp_transport_hold(tsp)) + transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) return 0; - assoc = tsp->asoc; + assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { @@ -546,7 +544,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(tsp); + sctp_transport_put(transport); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ab0d538a74ed..1099e99a53c4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -60,6 +60,8 @@ #include <net/inet_common.h> #include <net/inet_ecn.h> +#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) + /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; @@ -1355,6 +1357,8 @@ static __init int sctp_init(void) unsigned long limit; int max_share; int order; + int num_entries; + int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); @@ -1407,14 +1411,24 @@ static __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. + * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); - for (order = 0; (1UL << order) < goal; order++) - ; + /* Then compute the page order for said goal */ + order = get_order(goal); + + /* Now compute the required page order for the maximum sized table we + * want to create + */ + max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * + sizeof(struct sctp_bind_hashbucket)); + + /* Limit the page order by that maximum hash table size */ + order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; @@ -1430,20 +1444,35 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } - /* Allocate and initialize the SCTP port hash table. */ + /* Allocate and initialize the SCTP port hash table. + * Note that order is initalized to start at the max sized + * table we want to support. If we can't get that many pages + * reduce the order and try again + */ do { - sctp_port_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_bind_hashbucket); - if ((sctp_port_hashsize > (64 * 1024)) && order > 0) - continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } + + /* Now compute the number of entries that will fit in the + * port hash space we allocated + */ + num_entries = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + + /* And finish by rounding it down to the nearest power of two + * this wastes some memory of course, but its needed because + * the hash function operates based on the assumption that + * that the number of entries is a power of two + */ + sctp_port_hashsize = rounddown_pow_of_two(num_entries); + for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); @@ -1452,7 +1481,8 @@ static __init int sctp_init(void) if (sctp_transport_hashtable_init()) goto err_thash_alloc; - pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize); + pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, + num_entries); sctp_sysctl_register(); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5d6a03fad378..8449ca26aa0b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -62,11 +62,13 @@ #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen); + __u8 type, __u8 flags, int paylen, + gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, - __u8 flags, int paylen); + __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen); + __u8 type, __u8 flags, int paylen, + gfp_t gfp); static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, @@ -318,7 +320,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, * PLEASE DO NOT FIXME [This version does not support Host Name.] */ - retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize); + retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; @@ -465,7 +467,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, num_ext); /* Now allocate and fill out the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize); + retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; @@ -570,7 +572,8 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len); + retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, + cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = @@ -615,7 +618,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0); + retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -664,7 +667,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, - sizeof(sctp_cwrhdr_t)); + sizeof(sctp_cwrhdr_t), GFP_ATOMIC); if (!retval) goto nodata; @@ -698,7 +701,7 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, - sizeof(sctp_ecnehdr_t)); + sizeof(sctp_ecnehdr_t), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = @@ -713,7 +716,8 @@ nodata: */ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, - int data_len, __u8 flags, __u16 ssn) + int data_len, __u8 flags, __u16 ssn, + gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; @@ -734,7 +738,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, dp.ssn = htons(ssn); chunk_len = sizeof(dp) + data_len; - retval = sctp_make_data(asoc, flags, chunk_len); + retval = sctp_make_data(asoc, flags, chunk_len, gfp); if (!retval) goto nodata; @@ -781,7 +785,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len); + retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; @@ -861,7 +865,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, - sizeof(sctp_shutdownhdr_t)); + sizeof(sctp_shutdownhdr_t), GFP_ATOMIC); if (!retval) goto nodata; @@ -879,7 +883,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0); + retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, + GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -908,7 +913,8 @@ struct sctp_chunk *sctp_make_shutdown_complete( */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; - retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0); + retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, + 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -947,7 +953,8 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, flags = SCTP_CHUNK_FLAG_T; } - retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint); + retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, + GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -1139,7 +1146,8 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, struct sctp_chunk *retval; sctp_sender_hb_info_t hbinfo; - retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo)); + retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, + sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; @@ -1167,7 +1175,8 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen); + retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, + GFP_ATOMIC); if (!retval) goto nodata; @@ -1200,7 +1209,7 @@ static struct sctp_chunk *sctp_make_op_error_space( struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, - sizeof(sctp_errhdr_t) + size); + sizeof(sctp_errhdr_t) + size, GFP_ATOMIC); if (!retval) goto nodata; @@ -1271,7 +1280,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, - hmac_desc->hmac_len + sizeof(sctp_authhdr_t)); + hmac_desc->hmac_len + sizeof(sctp_authhdr_t), + GFP_ATOMIC); if (!retval) return NULL; @@ -1309,11 +1319,11 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, - struct sock *sk) + struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; - retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC); + retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; @@ -1361,7 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen) + __u8 type, __u8 flags, int paylen, + gfp_t gfp) { struct sctp_chunk *retval; sctp_chunkhdr_t *chunk_hdr; @@ -1369,8 +1380,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), - GFP_ATOMIC); + skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1381,7 +1391,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t)); sk = asoc ? asoc->base.sk : NULL; - retval = sctp_chunkify(skb, asoc, sk); + retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; @@ -1400,16 +1410,18 @@ nodata: } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, - __u8 flags, int paylen) + __u8 flags, int paylen, gfp_t gfp) { - return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen); + return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen) + __u8 type, __u8 flags, int paylen, + gfp_t gfp) { - struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen); + struct sctp_chunk *chunk; + chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); @@ -2756,7 +2768,8 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, length += addrlen; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length); + retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, + GFP_ATOMIC); if (!retval) return NULL; @@ -2940,7 +2953,8 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length); + retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, + GFP_ATOMIC); if (!retval) return NULL; @@ -3500,7 +3514,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, hint = (nstreams + 1) * sizeof(__u32); - retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint); + retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b5327bb77458..3c22c41a2bc2 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1019,13 +1019,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, * encouraged for small fragments. */ static int sctp_cmd_send_msg(struct sctp_association *asoc, - struct sctp_datamsg *msg) + struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; int error = 0; list_for_each_entry(chunk, &msg->chunks, frag_list) { - error = sctp_outq_tail(&asoc->outqueue, chunk); + error = sctp_outq_tail(&asoc->outqueue, chunk, gfp); if (error) break; } @@ -1249,7 +1249,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_NEW_ASOC: /* Register a new association. */ if (local_cork) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } @@ -1269,7 +1269,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_DELETE_TCB: if (local_cork) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Delete the current association. */ @@ -1423,13 +1423,14 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk); + error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, + gfp); break; case SCTP_CMD_SEND_PKT: /* Send a full packet to our peer. */ packet = cmd->obj.packet; - sctp_packet_transmit(packet); + sctp_packet_transmit(packet, gfp); sctp_ootb_pkt_free(packet); break; @@ -1639,7 +1640,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, */ chunk->pdiscard = 1; if (asoc) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } break; @@ -1677,7 +1678,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1704,7 +1705,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_outq_cork(&asoc->outqueue); local_cork = 1; } - error = sctp_cmd_send_msg(asoc, cmd->obj.msg); + error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_SEND_NEXT_ASCONF: sctp_cmd_send_asconf(asoc); @@ -1734,9 +1735,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); return error; nomem: error = -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6427b9d1197e..b89501e5c1a1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; + int i; if (!ep->auth_enable) return -EACCES; @@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; - if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) - return -EFAULT; + for (i = 0; i < num_idents; i++) { + __u16 hmacid = ntohs(hmacs->hmac_ids[i]); + + if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) + return -EFAULT; + } return 0; } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a431c14044a4..d517153891a6 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); - peer->last_time_heard = ktime_get(); + peer->last_time_heard = ktime_set(0, 0); peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | diff --git a/net/socket.c b/net/socket.c index c044d1e8508c..c5ddc52cf2b2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = { * NULL is returned. */ -static struct socket *sock_alloc(void) +struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; @@ -554,6 +554,7 @@ static struct socket *sock_alloc(void) this_cpu_add(sockets_in_use, 1); return sock; } +EXPORT_SYMBOL(sock_alloc); /** * sock_release - close a socket @@ -1106,12 +1107,8 @@ int __sock_create(struct net *net, int family, int type, int protocol, deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { - static int warned; - if (!warned) { - warned = 1; - pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n", - current->comm); - } + pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", + current->comm); family = PF_PACKET; } @@ -1874,7 +1871,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, - struct used_address *used_address) + struct used_address *used_address, + unsigned int allowed_msghdr_flags) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; @@ -1900,6 +1898,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, if (msg_sys->msg_controllen > INT_MAX) goto out_freeiov; + flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = @@ -1978,7 +1977,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags) if (!sock) goto out; - err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); fput_light(sock->file, fput_needed); out: @@ -2005,6 +2004,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; + unsigned int oflags = flags; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; @@ -2019,11 +2019,15 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; + flags |= MSG_BATCH; while (datagrams < vlen) { + if (datagrams == vlen - 1) + flags = oflags; + if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); @@ -2031,7 +2035,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 799e65b944b9..cabf586f47d7 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -740,7 +740,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); - BUG(); + gss_msg->msg.errno = -EIO; } goto err_release_msg; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 2b32fd602669..273bc3a35425 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1225,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize) if (bp[0] == '\\' && bp[1] == 'x') { /* HEX STRING */ bp += 2; - while (len < bufsize) { + while (len < bufsize - 1) { int h, l; h = hex_to_bin(bp[0]); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index cc1251d07297..2dcd7640eeb5 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -341,6 +341,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; rqst->rq_xid = headerp->rm_xid; + + rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 47f7da58a7f0..8b5833c1ff2e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1093,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; + int err; - switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); + err = switchdev_port_obj_dump(dev, &dump.fdb.obj, + switchdev_port_fdb_dump_cb); + cb->args[1] = err; return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index e401108360a2..ae469b37d852 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -412,11 +412,6 @@ enomem: return -ENOMEM; } -void tipc_bcast_reinit(struct net *net) -{ - tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net)); -} - void tipc_bcast_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 1944c6c00bb9..d5e79b3767fd 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -46,7 +46,6 @@ struct tipc_node_map; extern const char tipc_bclink_name[]; int tipc_bcast_init(struct net *net); -void tipc_bcast_reinit(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, struct sk_buff_head *xmitq); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 802ffad3200d..27a5406213c6 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -40,6 +40,7 @@ #include "link.h" #include "discover.h" #include "bcast.h" +#include "netlink.h" #define MAX_ADDR_STR 60 @@ -54,23 +55,6 @@ static struct tipc_media * const media_info_array[] = { NULL }; -static const struct nla_policy -tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { - [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_BEARER_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_BEARER_NAME - }, - [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { - [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, - [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } -}; - static void bearer_disable(struct net *net, struct tipc_bearer *b); /** diff --git a/net/tipc/link.c b/net/tipc/link.c index 6f4a6d9b0149..7d2bb3e70baa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012-2015, Ericsson AB + * Copyright (c) 1996-2007, 2012-2016, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -127,6 +127,7 @@ struct tipc_link { /* Management and link supervision data */ u32 peer_session; + u32 session; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; @@ -136,11 +137,7 @@ struct tipc_link { u16 peer_caps; bool active; u32 silent_intv_cnt; - struct { - unchar hdr[INT_H_SIZE]; - unchar body[TIPC_MAX_IF_NAME]; - } proto_msg; - struct tipc_msg *pmsg; + char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; @@ -195,14 +192,6 @@ struct tipc_link { static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -/* Properties valid for media, bearar and link */ -static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { - [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } -}; - /* Send states for broadcast NACKs */ enum { @@ -215,10 +204,11 @@ enum { * Interval between NACKs when packets arrive out of order */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* - * Out-of-range value for link session numbers + +/* Wildcard value for link session numbers. When it is known that + * peer endpoint is down, any session number must be accepted. */ -#define WILDCARD_SESSION 0x10000 +#define ANY_SESSION 0x10000 /* Link FSM states: */ @@ -398,16 +388,6 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } -static u32 link_own_addr(struct tipc_link *l) -{ - return msg_prevnode(l->pmsg); -} - -void tipc_link_reinit(struct tipc_link *l, u32 addr) -{ - msg_set_prevnode(l->pmsg, addr); -} - /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -441,29 +421,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct tipc_link **link) { struct tipc_link *l; - struct tipc_msg *hdr; l = kzalloc(sizeof(*l), GFP_ATOMIC); if (!l) return false; *link = l; - l->pmsg = (struct tipc_msg *)&l->proto_msg; - hdr = l->pmsg; - tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer); - msg_set_size(hdr, sizeof(l->proto_msg)); - msg_set_session(hdr, session); - msg_set_bearer_id(hdr, l->bearer_id); + l->session = session; /* Note: peer i/f name is completed by reset/activate message */ sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - strcpy((char *)msg_data(hdr), if_name); - + strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = WILDCARD_SESSION; + l->peer_session = ANY_SESSION; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -790,7 +763,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) struct tipc_msg *msg = buf_msg(skb_peek(list)); int imp = msg_importance(msg); u32 oport = msg_origport(msg); - u32 addr = link_own_addr(link); + u32 addr = tipc_own_addr(link->net); struct sk_buff *skb; /* This really cannot happen... */ @@ -839,16 +812,9 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - /* Link is down, accept any session */ - l->peer_session = WILDCARD_SESSION; - - /* If peer is up, it only accepts an incremented session number */ - msg_set_session(l->pmsg, msg_session(l->pmsg) + 1); - - /* Prepare for renewed mtu size negotiation */ + l->peer_session = ANY_SESSION; + l->session++; l->mtu = l->advertised_mtu; - - /* Clean up all queues and counters: */ __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); skb_queue_splice_init(&l->wakeupq, l->inputq); @@ -903,8 +869,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) return link_schedule_user(l, list); } - if (unlikely(msg_size(hdr) > mtu)) + if (unlikely(msg_size(hdr) > mtu)) { + skb_queue_purge(list); return -EMSGSIZE; + } /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { @@ -916,8 +884,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (likely(skb_queue_len(transmq) < maxwin)) { _skb = skb_clone(skb, GFP_ATOMIC); - if (!_skb) + if (!_skb) { + skb_queue_purge(list); return -ENOBUFS; + } __skb_dequeue(list); __skb_queue_tail(transmq, skb); __skb_queue_tail(xmitq, _skb); @@ -1152,7 +1122,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Broadcast ACK must be sent via a unicast link => defer to caller */ if (link_is_bc_rcvlink(l)) { - if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf) + if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) return 0; l->rcv_unacked = 0; return TIPC_LINK_SND_BC_ACK; @@ -1264,15 +1234,30 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) { - struct sk_buff *skb = NULL; - struct tipc_msg *hdr = l->pmsg; + struct sk_buff *skb; + struct tipc_msg *hdr; + struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) return; - msg_set_type(hdr, mtyp); + if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) + return; + + if (!skb_queue_empty(dfq)) + rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, + TIPC_MAX_IF_NAME, l->addr, + tipc_own_addr(l->net), 0, 0, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + msg_set_session(hdr, l->session); + msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); msg_set_next_sent(hdr, l->snd_nxt); msg_set_ack(hdr, l->rcv_nxt - 1); @@ -1282,36 +1267,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_linkprio(hdr, priority); msg_set_redundant_link(hdr, node_up); msg_set_seq_gap(hdr, 0); - - /* Compatibility: created msg must not be in sequence with pkt flow */ msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { - if (!tipc_link_is_up(l)) - return; - - /* Override rcvgap if there are packets in deferred queue */ - if (!skb_queue_empty(&l->deferdq)) - rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt; - if (rcvgap) { - msg_set_seq_gap(hdr, rcvgap); - l->stats.sent_nacks++; - } + msg_set_seq_gap(hdr, rcvgap); + msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); - if (probe) - l->stats.sent_probes++; l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - msg_set_ack(hdr, l->rcv_nxt - 1); - msg_set_next_sent(hdr, 1); + strcpy(msg_data(hdr), l->if_name); } - skb = tipc_buf_acquire(msg_size(hdr)); - if (!skb) - return; - skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + if (probe) + l->stats.sent_probes++; + if (rcvgap) + l->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); } @@ -1336,7 +1308,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, link_own_addr(l), + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, TIPC_ERR_NO_PORT); if (!skb) { pr_warn("%sunable to create tunnel packet\n", link_co_err); @@ -1347,7 +1319,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, __skb_queue_purge(&tmpxq); /* Initialize reusable tunnel packet header */ - tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL, + tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); msg_set_msgcnt(&tnlhdr, pktcnt); @@ -1406,7 +1378,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_link_is_blocked(l) || !xmitq) goto exit; - if (link_own_addr(l) > msg_prevnode(hdr)) + if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); switch (mtyp) { @@ -1414,7 +1386,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Ignore duplicate RESET with old session number */ if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != WILDCARD_SESSION)) + (l->peer_session != ANY_SESSION)) break; /* fall thru' */ @@ -1511,7 +1483,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast, u16 gap_to = peers_snd_nxt - 1; skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, - 0, l->addr, link_own_addr(l), 0, 0, 0); + 0, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return false; hdr = buf_msg(skb); @@ -1666,7 +1638,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (mtyp != STATE_MSG) return 0; - if (dnode == link_own_addr(l)) { + if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq); l->stats.recv_nacks++; @@ -1958,8 +1930,10 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) + if (!hdr) { + tipc_bcast_unlock(net); return -EMSGSIZE; + } attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); if (!attrs) diff --git a/net/tipc/link.h b/net/tipc/link.h index b4ee9d6e181d..6a94175ee20a 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -86,7 +86,6 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, struct tipc_link **link); -void tipc_link_reinit(struct tipc_link *l, u32 addr); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 777b979b8463..e190460fe0d3 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -47,12 +47,6 @@ #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ -static const struct nla_policy -tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { - [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } -}; - /** * struct name_info - name sequence publication info * @node_list: circular list of publications made by own node diff --git a/net/tipc/net.c b/net/tipc/net.c index 77bf9113c7a7..28bf4feeb81c 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -41,11 +41,7 @@ #include "socket.h" #include "node.h" #include "bcast.h" - -static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { - [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NET_ID] = { .type = NLA_U32 } -}; +#include "netlink.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -116,7 +112,6 @@ int tipc_net_start(struct net *net, u32 addr) tn->own_addr = addr; tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_bcast_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, TIPC_ZONE_SCOPE, 0, tn->own_addr); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 8975b0135b76..56935df2167a 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } }; +const struct nla_policy +tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { + [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { + [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, + [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } +}; + +const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { + [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NET_ID] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_LINK_NAME }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { + [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } +}; + +/* Properties valid for media, bearer and link */ +const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { + [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { + [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_BEARER_NAME }, + [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { + [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, + [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + /* Users of the legacy API (tipc-config) can't handle that we add operations, * so we have a separate genl handling for the new API. */ diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 08a1db67b927..ed1dbcb4afbd 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -35,6 +35,7 @@ #ifndef _TIPC_NETLINK_H #define _TIPC_NETLINK_H +#include <net/netlink.h> extern struct genl_family tipc_genl_family; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); @@ -45,6 +46,16 @@ struct tipc_nl_msg { u32 seq; }; +extern const struct nla_policy tipc_nl_name_table_policy[]; +extern const struct nla_policy tipc_nl_sock_policy[]; +extern const struct nla_policy tipc_nl_net_policy[]; +extern const struct nla_policy tipc_nl_link_policy[]; +extern const struct nla_policy tipc_nl_node_policy[]; +extern const struct nla_policy tipc_nl_prop_policy[]; +extern const struct nla_policy tipc_nl_bearer_policy[]; +extern const struct nla_policy tipc_nl_media_policy[]; +extern const struct nla_policy tipc_nl_udp_policy[]; + int tipc_netlink_start(void); int tipc_netlink_compat_start(void); void tipc_netlink_stop(void); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2c016fdefe97..d7d050f44fc1 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1104,8 +1104,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; - msg.dst_sk = info->dst_sk; msg.net = genl_info_net(info); + msg.dst_sk = skb->sk; if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); diff --git a/net/tipc/node.c b/net/tipc/node.c index f8a8255a7182..ace178fd3850 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -41,6 +41,7 @@ #include "socket.h" #include "bcast.h" #include "discover.h" +#include "netlink.h" #define INVALID_NODE_SIG 0x10000 @@ -164,28 +165,6 @@ struct tipc_sock_conn { struct list_head list; }; -static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { - [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_LINK_NAME - }, - [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { - [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } -}; - static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; @@ -225,9 +204,10 @@ static unsigned int tipc_hashfn(u32 addr) static void tipc_node_kref_release(struct kref *kref) { - struct tipc_node *node = container_of(kref, struct tipc_node, kref); + struct tipc_node *n = container_of(kref, struct tipc_node, kref); - tipc_node_delete(node); + kfree(n->bc_entry.link); + kfree_rcu(n, rcu); } static void tipc_node_put(struct tipc_node *node) @@ -245,23 +225,23 @@ static void tipc_node_get(struct tipc_node *node) */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node; + unsigned int thash = tipc_hashfn(addr); if (unlikely(!in_own_cluster_exact(net, addr))) return NULL; rcu_read_lock(); - hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)], - hash) { - if (node->addr == addr) { - tipc_node_get(node); - rcu_read_unlock(); - return node; - } + hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { + if (node->addr != addr) + continue; + if (!kref_get_unless_zero(&node->kref)) + node = NULL; + break; } rcu_read_unlock(); - return NULL; + return node; } static void tipc_node_read_lock(struct tipc_node *n) @@ -346,12 +326,6 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); - hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); - list_for_each_entry_rcu(temp_node, &tn->node_list, list) { - if (n->addr < temp_node->addr) - break; - } - list_add_tail_rcu(&n->list, &temp_node->list); n->state = SELF_DOWN_PEER_LEAVING; n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; @@ -372,6 +346,12 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) tipc_node_get(n); setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); n->keepalive_intv = U32_MAX; + hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); exit: spin_unlock_bh(&tn->node_list_lock); return n; @@ -395,21 +375,20 @@ static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); - kfree(node->bc_entry.link); - kfree_rcu(node, rcu); + tipc_node_put(node); + + del_timer_sync(&node->timer); + tipc_node_put(node); } void tipc_node_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) { - if (del_timer(&node->timer)) - tipc_node_put(node); - tipc_node_put(node); - } + list_for_each_entry_safe(node, t_node, &tn->node_list, list) + tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } @@ -530,9 +509,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) - tipc_node_get(n); - tipc_node_put(n); + mod_timer(&n->timer, jiffies + n->keepalive_intv); } /** @@ -845,7 +822,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && !tipc_link_is_reset(l)) + if (reset && l && !tipc_link_is_reset(l)) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } @@ -1166,7 +1143,7 @@ msg_full: * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1174,33 +1151,43 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; - int bearer_id = -1; - int rc = -EHOSTUNREACH; + int bearer_id; + int rc; + + if (in_own_node(net, dnode)) { + tipc_sk_rcv(net, list); + return 0; + } - __skb_queue_head_init(&xmitq); n = tipc_node_find(net, dnode); - if (likely(n)) { - tipc_node_read_lock(n); - bearer_id = n->active_links[selector & 1]; - if (bearer_id >= 0) { - le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); - rc = tipc_link_xmit(le->link, list, &xmitq); - spin_unlock_bh(&le->lock); - } + if (unlikely(!n)) { + skb_queue_purge(list); + return -EHOSTUNREACH; + } + + tipc_node_read_lock(n); + bearer_id = n->active_links[selector & 1]; + if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); - if (likely(!rc)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) - tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); - return rc; + skb_queue_purge(list); + return -EHOSTUNREACH; } - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } + __skb_queue_head_init(&xmitq); + le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); + rc = tipc_link_xmit(le->link, list, &xmitq); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(n); + + if (likely(rc == 0)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + else if (rc == -ENOBUFS) + tipc_node_link_down(n, bearer_id, false); + + tipc_node_put(n); + return rc; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 69c29050f14a..3eeb50a27b89 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "netlink.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -126,14 +127,6 @@ static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; static struct proto tipc_proto; -static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { - [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, - [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } -}; - static const struct rhashtable_params tsk_rht_params; /* @@ -673,7 +666,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct iov_iter save = msg->msg_iter; uint mtu; int rc; @@ -687,14 +680,16 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, msg_set_nameupper(mhdr, seq->upper); msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + skb_queue_head_init(&pktchain); + new_mtu: mtu = tipc_bcast_get_mtu(net); - rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); if (unlikely(rc < 0)) return rc; do { - rc = tipc_bcast_xmit(net, pktchain); + rc = tipc_bcast_xmit(net, &pktchain); if (likely(!rc)) return dsz; @@ -704,7 +699,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; @@ -863,7 +858,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; u32 dnode, dport; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct sk_buff *skb; struct tipc_name_seq *seq; struct iov_iter save; @@ -924,17 +919,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) msg_set_hdr_sz(mhdr, BASIC_H_SIZE); } + skb_queue_head_init(&pktchain); save = m->msg_iter; new_mtu: mtu = tipc_node_get_mtu(net, dnode, tsk->portid); - rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain); if (rc < 0) return rc; do { - skb = skb_peek(pktchain); + skb = skb_peek(&pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); + rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; @@ -946,7 +942,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; @@ -1016,7 +1012,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); u32 portid = tsk->portid; int rc = -EINVAL; @@ -1044,17 +1040,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); dnode = tsk_peer_node(tsk); + skb_queue_head_init(&pktchain); next: save = m->msg_iter; mtu = tsk->max_pkt; send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); if (unlikely(rc < 0)) return rc; + do { if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_node_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; @@ -1063,7 +1061,7 @@ next: goto next; } if (rc == -EMSGSIZE) { - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1077,7 +1075,7 @@ next: rc = tipc_wait_for_sndpkt(sock, &timeo); } while (!rc); - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); return sent ? sent : rc; } diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 22963cafd5ed..e6cb386fbf34 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -326,7 +326,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, return tipc_subscrp_cancel(s, subscriber); } - tipc_subscrp_subscribe(net, s, subscriber, swap); + if (s) + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d63a911e7fe2..c94f9a15e2cd 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -48,20 +48,13 @@ #include <linux/tipc_netlink.h> #include "core.h" #include "bearer.h" +#include "netlink.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 #define UDP_MIN_HEADROOM 28 -static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { - [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, - [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, - [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, -}; - /** * struct udp_media_addr - IP/UDP addressing information * @@ -181,6 +174,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, err = PTR_ERR(rt); goto tx_error; } + + skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, @@ -201,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, src->udp_port, + &dst->ipv6, 0, ttl, 0, src->udp_port, dst->udp_port, false); #endif } @@ -274,7 +269,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, struct udp_media_addr *remote) { struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; - struct sockaddr_storage *sa_local, *sa_remote; + struct sockaddr_storage sa_local, sa_remote; if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; @@ -283,41 +278,48 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, tipc_nl_udp_policy)) goto err; if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { - sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); - sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL], + sizeof(sa_local)); + nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE], + sizeof(sa_remote)); } else { err: pr_err("Invalid UDP bearer configuration"); return -EINVAL; } - if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { struct sockaddr_in *ip4; - ip4 = (struct sockaddr_in *)sa_local; + ip4 = (struct sockaddr_in *)&sa_local; local->proto = htons(ETH_P_IP); local->udp_port = ip4->sin_port; local->ipv4.s_addr = ip4->sin_addr.s_addr; - ip4 = (struct sockaddr_in *)sa_remote; + ip4 = (struct sockaddr_in *)&sa_remote; remote->proto = htons(ETH_P_IP); remote->udp_port = ip4->sin_port; remote->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) - } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { + int atype; struct sockaddr_in6 *ip6; - ip6 = (struct sockaddr_in6 *)sa_local; + ip6 = (struct sockaddr_in6 *)&sa_local; + atype = ipv6_addr_type(&ip6->sin6_addr); + if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) + return -EINVAL; + local->proto = htons(ETH_P_IPV6); local->udp_port = ip6->sin6_port; - local->ipv6 = ip6->sin6_addr; + memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); ub->ifindex = ip6->sin6_scope_id; - ip6 = (struct sockaddr_in6 *)sa_remote; + ip6 = (struct sockaddr_in6 *)&sa_remote; remote->proto = htons(ETH_P_IPV6); remote->udp_port = ip6->sin6_port; - remote->ipv6 = ip6->sin6_addr; + memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b3745557fc89..8269da73e9e5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1558,7 +1558,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } @@ -1778,7 +1778,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -2274,13 +2279,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) size_t size = state->size; unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags & MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); @@ -2302,6 +2309,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) bool drop_skb; struct sk_buff *skb, *last; +redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -2326,9 +2334,11 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo, last, @@ -2341,7 +2351,7 @@ again: } mutex_lock(&u->readlock); - continue; + goto redo; unlock: unix_state_unlock(sk); break; diff --git a/net/unix/diag.c b/net/unix/diag.c index c512f64d5287..4d9679701a6d 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -220,7 +220,7 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(int ino) +static struct sock *unix_lookup_by_ino(unsigned int ino) { int i; struct sock *sk; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8fcdc2283af5..6a0d48525fcf 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp) * descriptor if it is for an AF_UNIX socket. */ -void unix_inflight(struct file *fp) +void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -133,11 +133,11 @@ void unix_inflight(struct file *fp) } unix_tot_inflight++; } - fp->f_cred->user->unix_inflight++; + user->unix_inflight++; spin_unlock(&unix_gc_lock); } -void unix_notinflight(struct file *fp) +void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp) list_del_init(&u->link); unix_tot_inflight--; } - fp->f_cred->user->unix_inflight--; + user->unix_inflight--; spin_unlock(&unix_gc_lock); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7fd1220fbfa0..bbe65dcb9738 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (total_written < len) { ssize_t written; @@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); @@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; } - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } /* These checks occur both as part of and after the loop @@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, out_wait: if (total_written > 0) err = total_written; - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; @@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (1) { s64 ready = vsock_stream_has_data(vsk); @@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, */ err = -ENOMEM; - goto out_wait; + goto out; } else if (ready > 0) { ssize_t read; @@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) - goto out_wait; + goto out; if (read >= target || flags & MSG_PEEK) break; @@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, break; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { @@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = -EAGAIN; break; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } } @@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = copied; } -out_wait: - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index da72ed32f143..6c606120abfe 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -50,8 +50,8 @@ config CFG80211_DEVELOPER_WARNINGS default n help This option enables some additional warnings that help - cfg80211 developers and driver developers, but that can - trigger due to races with userspace. + cfg80211 developers and driver developers, but beware that + they can also trigger due to races with userspace. For example, when a driver reports that it was disconnected from the AP, but the user disconnects manually at the same @@ -61,19 +61,6 @@ config CFG80211_DEVELOPER_WARNINGS on it (or mac80211). -config CFG80211_REG_DEBUG - bool "cfg80211 regulatory debugging" - depends on CFG80211 - default n - ---help--- - You can enable this if you want to debug regulatory changes. - For more information on cfg80211 regulatory refer to the wireless - wiki: - - http://wireless.kernel.org/en/developers/Regulatory - - If unsure, say N. - config CFG80211_CERTIFICATION_ONUS bool "cfg80211 certification onus" depends on CFG80211 && EXPERT @@ -123,7 +110,7 @@ config CFG80211_REG_RELAX_NO_IR interface which associated to an AP which userspace assumes or confirms to be an authorized master, i.e., with radar detection support and DFS capabilities. However, note that in order to not create daisy chain - scenarios, this relaxation is not allowed in cases that the BSS client + scenarios, this relaxation is not allowed in cases where the BSS client is associated to P2P GO and in addition the P2P GO instantiated on a channel due to this relaxation should not allow connection from non P2P clients. @@ -148,7 +135,7 @@ config CFG80211_DEBUGFS depends on CFG80211 depends on DEBUG_FS ---help--- - You can enable this if you want to debugfs entries for cfg80211. + You can enable this if you want debugfs entries for cfg80211. If unsure, say N. @@ -159,7 +146,7 @@ config CFG80211_INTERNAL_REGDB ---help--- This option generates an internal data structure representing the wireless regulatory rules described in net/wireless/db.txt - and includes code to query that database. This is an alternative + and includes code to query that database. This is an alternative to using CRDA for defining regulatory rules for the kernel. Using this option requires some parsing of the db.txt at build time, @@ -172,7 +159,7 @@ config CFG80211_INTERNAL_REGDB http://wireless.kernel.org/en/developers/Regulatory - Most distributions have a CRDA package. So if unsure, say N. + Most distributions have a CRDA package. So if unsure, say N. config CFG80211_CRDA_SUPPORT bool "support CRDA" if CFG80211_INTERNAL_REGDB diff --git a/net/wireless/core.c b/net/wireless/core.c index b0915515640e..9f1c4aa851ef 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -352,6 +352,16 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, WARN_ON(ops->add_station && !ops->del_station); WARN_ON(ops->add_mpath && !ops->del_mpath); WARN_ON(ops->join_mesh && !ops->leave_mesh); + WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device); + WARN_ON(ops->start_ap && !ops->stop_ap); + WARN_ON(ops->join_ocb && !ops->leave_ocb); + WARN_ON(ops->suspend && !ops->resume); + WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop); + WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel); + WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch); + WARN_ON(ops->add_tx_ts && !ops->del_tx_ts); + WARN_ON(ops->set_tx_power && !ops->get_tx_power); + WARN_ON(ops->set_antenna && !ops->get_antenna); alloc_size = sizeof(*rdev) + sizeof_priv; @@ -1147,6 +1157,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, return NOTIFY_DONE; } + wireless_nlevent_flush(); + return NOTIFY_OK; } diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index fb44fa3bf4ef..ff328250bc44 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -711,7 +711,7 @@ EXPORT_SYMBOL(cfg80211_rx_mgmt); void cfg80211_dfs_channels_update_work(struct work_struct *work) { - struct delayed_work *delayed_work; + struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; @@ -721,7 +721,6 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) unsigned long timeout, next_time = 0; int bandid, i; - delayed_work = container_of(work, struct delayed_work, work); rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d4786f2802aa..98c924260b3d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2016 Intel Deutschland GmbH */ #include <linux/if.h> @@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, + [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) + return -EOPNOTSUPP; + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { @@ -7281,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) } if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { - if (!(rdev->wiphy.features & - NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + if (!((rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) && + (rdev->wiphy.features & NL80211_FEATURE_QUIET)) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_RRM)) return -EINVAL; req.flags |= ASSOC_REQ_USE_RRM; } @@ -7547,7 +7554,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) && no_ht) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } } @@ -7971,15 +7978,23 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { - if (!(rdev->wiphy.features & - NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) { + if (!((rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) && + (rdev->wiphy.features & NL80211_FEATURE_QUIET)) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_RRM)) { kzfree(connkeys); return -EINVAL; } connect.flags |= ASSOC_REQ_USE_RRM; } + connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) { + kzfree(connkeys); + return -EOPNOTSUPP; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 722da616438c..6582d155e2fc 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -43,6 +43,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = { [IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, }, [IEEE80211_RADIOTAP_MCS] = { .align = 1, .size = 3, }, [IEEE80211_RADIOTAP_AMPDU_STATUS] = { .align = 4, .size = 8, }, + [IEEE80211_RADIOTAP_VHT] = { .align = 2, .size = 12, }, /* * add more here as they are defined in radiotap.h */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 547ceecc0523..c5fb317eee68 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -60,13 +60,6 @@ #include "regdb.h" #include "nl80211.h" -#ifdef CONFIG_CFG80211_REG_DEBUG -#define REG_DBG_PRINT(format, args...) \ - printk(KERN_DEBUG pr_fmt(format), ##args) -#else -#define REG_DBG_PRINT(args...) -#endif - /* * Grace period we give before making sure all current interfaces reside on * channels allowed by the current regulatory domain. @@ -178,12 +171,10 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) if (wiphy_regd->dfs_region == regd->dfs_region) goto out; - REG_DBG_PRINT("%s: device specific dfs_region " - "(%s) disagrees with cfg80211's " - "central dfs_region (%s)\n", - dev_name(&wiphy->dev), - reg_dfs_region_str(wiphy_regd->dfs_region), - reg_dfs_region_str(regd->dfs_region)); + pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n", + dev_name(&wiphy->dev), + reg_dfs_region_str(wiphy_regd->dfs_region), + reg_dfs_region_str(regd->dfs_region)); out: return regd->dfs_region; @@ -543,7 +534,7 @@ static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work); static void crda_timeout_work(struct work_struct *work) { - REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); + pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); rtnl_lock(); reg_crda_timeouts++; restore_regulatory_settings(true); @@ -585,7 +576,7 @@ static int call_crda(const char *alpha2) if (!is_world_regdom((char *) alpha2)) pr_debug("Calling CRDA for country: %c%c\n", - alpha2[0], alpha2[1]); + alpha2[0], alpha2[1]); else pr_debug("Calling CRDA to update world regulatory domain\n"); @@ -1132,42 +1123,6 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator) } EXPORT_SYMBOL(reg_initiator_name); -static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, - struct ieee80211_channel *chan, - const struct ieee80211_reg_rule *reg_rule) -{ -#ifdef CONFIG_CFG80211_REG_DEBUG - const struct ieee80211_power_rule *power_rule; - const struct ieee80211_freq_range *freq_range; - char max_antenna_gain[32], bw[32]; - - power_rule = ®_rule->power_rule; - freq_range = ®_rule->freq_range; - - if (!power_rule->max_antenna_gain) - snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A"); - else - snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi", - power_rule->max_antenna_gain); - - if (reg_rule->flags & NL80211_RRF_AUTO_BW) - snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", - freq_range->max_bandwidth_khz, - reg_get_max_bandwidth(regd, reg_rule)); - else - snprintf(bw, sizeof(bw), "%d KHz", - freq_range->max_bandwidth_khz); - - REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n", - chan->center_freq); - - REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n", - freq_range->start_freq_khz, freq_range->end_freq_khz, - bw, max_antenna_gain, - power_rule->max_eirp); -#endif -} - static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd, const struct ieee80211_reg_rule *reg_rule, const struct ieee80211_channel *chan) @@ -1242,20 +1197,19 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - REG_DBG_PRINT("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz for good\n", + chan->center_freq); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - REG_DBG_PRINT("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz\n", + chan->center_freq); chan->flags |= IEEE80211_CHAN_DISABLED; } return; } regd = reg_get_regdomain(wiphy); - chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); @@ -1393,18 +1347,15 @@ static bool ignore_reg_update(struct wiphy *wiphy, return true; if (!lr) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since last_request is not set\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since last_request is not set\n", + reg_initiator_name(initiator)); return true; } if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since the driver uses its own custom " - "regulatory domain\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n", + reg_initiator_name(initiator)); return true; } @@ -1415,10 +1366,8 @@ static bool ignore_reg_update(struct wiphy *wiphy, if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd && initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && !is_world_regdom(lr->alpha2)) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since the driver requires its own regulatory " - "domain to be set first\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n", + reg_initiator_name(initiator)); return true; } @@ -1699,7 +1648,7 @@ static void reg_check_chans_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; - REG_DBG_PRINT("Verifying active interfaces after reg change\n"); + pr_debug("Verifying active interfaces after reg change\n"); rtnl_lock(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) @@ -1781,8 +1730,8 @@ static void handle_channel_custom(struct wiphy *wiphy, } if (IS_ERR(reg_rule)) { - REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", + chan->center_freq); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -1792,8 +1741,6 @@ static void handle_channel_custom(struct wiphy *wiphy, return; } - chan_reg_rule_print_dbg(regd, chan, reg_rule); - power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); @@ -2524,7 +2471,7 @@ static void restore_alpha2(char *alpha2, bool reset_user) if (is_user_regdom_saved()) { /* Unless we're asked to ignore it and reset it */ if (reset_user) { - REG_DBG_PRINT("Restoring regulatory settings including user preference\n"); + pr_debug("Restoring regulatory settings including user preference\n"); user_alpha2[0] = '9'; user_alpha2[1] = '7'; @@ -2534,24 +2481,24 @@ static void restore_alpha2(char *alpha2, bool reset_user) * back as they were for a full restore. */ if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], ieee80211_regdom[1]); + pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } } else { - REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n", - user_alpha2[0], user_alpha2[1]); + pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n", + user_alpha2[0], user_alpha2[1]); alpha2[0] = user_alpha2[0]; alpha2[1] = user_alpha2[1]; } } else if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], ieee80211_regdom[1]); + pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } else - REG_DBG_PRINT("Restoring regulatory settings\n"); + pr_debug("Restoring regulatory settings\n"); } static void restore_custom_reg_settings(struct wiphy *wiphy) @@ -2663,14 +2610,14 @@ static void restore_regulatory_settings(bool reset_user) list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); spin_unlock(®_requests_lock); - REG_DBG_PRINT("Kicking the queue\n"); + pr_debug("Kicking the queue\n"); schedule_work(®_work); } void regulatory_hint_disconnect(void) { - REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n"); + pr_debug("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false); } @@ -2718,10 +2665,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), - wiphy_name(wiphy)); + pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, + ieee80211_frequency_to_channel(beacon_chan->center_freq), + wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, sizeof(struct ieee80211_channel)); @@ -2800,8 +2747,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region) case NL80211_DFS_JP: return true; default: - REG_DBG_PRINT("Ignoring uknown DFS master region: %d\n", - dfs_region); + pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region); return false; } } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 8020b5b094d4..544558171787 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, - IEEE80211_BSS_TYPE_ESS, + wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; @@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, wdev->ssid, wdev->ssid_len, - IEEE80211_BSS_TYPE_ESS, + wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (bss) cfg80211_hold_bss(bss_from_pub(bss)); @@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev, bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid, wdev->ssid_len, - IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); + wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; @@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); + /* stop critical protocol if supported */ + if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { + rdev->crit_proto_nlportid = 0; + rdev_crit_proto_stop(rdev, wdev); + } + /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. @@ -1017,6 +1023,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, memcpy(wdev->ssid, connect->ssid, connect->ssid_len); wdev->ssid_len = connect->ssid_len; + wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : + IEEE80211_BSS_TYPE_ESS; + if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else diff --git a/net/wireless/util.c b/net/wireless/util.c index 92770427b211..9f440a9de63b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -393,9 +393,9 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); -unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { - int ae = meshhdr->flags & MESH_FLAGS_AE; + int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: @@ -407,21 +407,31 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) return 18; } } + +unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +{ + return __ieee80211_get_mesh_hdrlen(meshhdr->flags); +} EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); -int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype) +static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr, + const u8 *addr, enum nl80211_iftype iftype) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - u16 hdrlen, ethertype; - u8 *payload; - u8 dst[ETH_ALEN]; - u8 src[ETH_ALEN] __aligned(2); + struct { + u8 hdr[ETH_ALEN] __aligned(2); + __be16 proto; + } payload; + struct ethhdr tmp; + u16 hdrlen; + u8 mesh_flags = 0; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len < hdrlen + 8) + return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header @@ -432,8 +442,11 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ - memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN); - memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN); + memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); + memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); + + if (iftype == NL80211_IFTYPE_MESH_POINT) + skb_copy_bits(skb, hdrlen, &mesh_flags, 1); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { @@ -450,44 +463,31 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, iftype != NL80211_IFTYPE_STATION)) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - struct ieee80211s_hdr *meshdr = - (struct ieee80211s_hdr *) (skb->data + hdrlen); - /* make sure meshdr->flags is on the linear part */ - if (!pskb_may_pull(skb, hdrlen + 1)) - return -1; - if (meshdr->flags & MESH_FLAGS_AE_A4) + if (mesh_flags & MESH_FLAGS_AE_A4) return -1; - if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { + if (mesh_flags & MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), - dst, ETH_ALEN); - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr2), - src, ETH_ALEN); + tmp.h_dest, 2 * ETH_ALEN); } - hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || - (is_multicast_ether_addr(dst) && - ether_addr_equal(src, addr))) + (is_multicast_ether_addr(tmp.h_dest) && + ether_addr_equal(tmp.h_source, addr))) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - struct ieee80211s_hdr *meshdr = - (struct ieee80211s_hdr *) (skb->data + hdrlen); - /* make sure meshdr->flags is on the linear part */ - if (!pskb_may_pull(skb, hdrlen + 1)) - return -1; - if (meshdr->flags & MESH_FLAGS_AE_A5_A6) + if (mesh_flags & MESH_FLAGS_AE_A5_A6) return -1; - if (meshdr->flags & MESH_FLAGS_AE_A4) + if (mesh_flags & MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), - src, ETH_ALEN); - hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + tmp.h_source, ETH_ALEN); + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; case cpu_to_le16(0): @@ -498,33 +498,33 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, break; } - if (!pskb_may_pull(skb, hdrlen + 8)) - return -1; - - payload = skb->data + hdrlen; - ethertype = (payload[6] << 8) | payload[7]; + skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)); + tmp.h_proto = payload.proto; - if (likely((ether_addr_equal(payload, rfc1042_header) && - ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || - ether_addr_equal(payload, bridge_tunnel_header))) { + if (likely((ether_addr_equal(payload.hdr, rfc1042_header) && + tmp.h_proto != htons(ETH_P_AARP) && + tmp.h_proto != htons(ETH_P_IPX)) || + ether_addr_equal(payload.hdr, bridge_tunnel_header))) /* remove RFC1042 or Bridge-Tunnel encapsulation and * replace EtherType */ - skb_pull(skb, hdrlen + 6); - memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); - } else { - struct ethhdr *ehdr; - __be16 len; + hdrlen += ETH_ALEN + 2; + else + tmp.h_proto = htons(skb->len); - skb_pull(skb, hdrlen); - len = htons(skb->len); + pskb_pull(skb, hdrlen); + + if (!ehdr) ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); - memcpy(ehdr->h_dest, dst, ETH_ALEN); - memcpy(ehdr->h_source, src, ETH_ALEN); - ehdr->h_proto = len; - } + memcpy(ehdr, &tmp, sizeof(tmp)); + return 0; } + +int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +{ + return __ieee80211_data_to_8023(skb, NULL, addr, iftype); +} EXPORT_SYMBOL(ieee80211_data_to_8023); int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, @@ -636,7 +636,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, /* Update skb pointers to various headers since this modified frame * is going to go through Linux networking code that may potentially * need things like pointer to IP header. */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, nh_pos); skb_set_transport_header(skb, h_pos); @@ -644,70 +644,147 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, } EXPORT_SYMBOL(ieee80211_data_from_8023); +static void +__frame_add_frag(struct sk_buff *skb, struct page *page, + void *ptr, int len, int size) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + int page_offset; + + atomic_inc(&page->_count); + page_offset = ptr - page_address(page); + skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); +} + +static void +__ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, + int offset, int len) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + const skb_frag_t *frag = &sh->frags[-1]; + struct page *frag_page; + void *frag_ptr; + int frag_len, frag_size; + int head_size = skb->len - skb->data_len; + int cur_len; + + frag_page = virt_to_head_page(skb->head); + frag_ptr = skb->data; + frag_size = head_size; + + while (offset >= frag_size) { + offset -= frag_size; + frag++; + frag_page = skb_frag_page(frag); + frag_ptr = skb_frag_address(frag); + frag_size = skb_frag_size(frag); + } + + frag_ptr += offset; + frag_len = frag_size - offset; + + cur_len = min(len, frag_len); + + __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); + len -= cur_len; + + while (len > 0) { + frag++; + frag_len = skb_frag_size(frag); + cur_len = min(len, frag_len); + __frame_add_frag(frame, skb_frag_page(frag), + skb_frag_address(frag), cur_len, frag_len); + len -= cur_len; + } +} + +static struct sk_buff * +__ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, + int offset, int len, bool reuse_frag) +{ + struct sk_buff *frame; + int cur_len = len; + + if (skb->len - offset < len) + return NULL; + + /* + * When reusing framents, copy some data to the head to simplify + * ethernet header handling and speed up protocol header processing + * in the stack later. + */ + if (reuse_frag) + cur_len = min_t(int, len, 32); + + /* + * Allocate and reserve two bytes more for payload + * alignment since sizeof(struct ethhdr) is 14. + */ + frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); + + skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); + skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); + + len -= cur_len; + if (!len) + return frame; + + offset += cur_len; + __ieee80211_amsdu_copy_frag(skb, frame, offset, len); + + return frame; +} void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, bool has_80211_header) { + unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; u16 ethertype; u8 *payload; - const struct ethhdr *eth; - int remaining, err; - u8 dst[ETH_ALEN], src[ETH_ALEN]; + int offset = 0, remaining, err; + struct ethhdr eth; + bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); + bool reuse_skb = false; + bool last = false; if (has_80211_header) { - err = ieee80211_data_to_8023(skb, addr, iftype); + err = __ieee80211_data_to_8023(skb, ð, addr, iftype); if (err) goto out; - - /* skip the wrapping header */ - eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr)); - if (!eth) - goto out; - } else { - eth = (struct ethhdr *) skb->data; } - while (skb != frame) { + while (!last) { + unsigned int subframe_len; + int len; u8 padding; - __be16 len = eth->h_proto; - unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len); - - remaining = skb->len; - memcpy(dst, eth->h_dest, ETH_ALEN); - memcpy(src, eth->h_source, ETH_ALEN); + skb_copy_bits(skb, offset, ð, sizeof(eth)); + len = ntohs(eth.h_proto); + subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; + /* the last MSDU has no padding */ + remaining = skb->len - offset; if (subframe_len > remaining) goto purge; - skb_pull(skb, sizeof(struct ethhdr)); + offset += sizeof(struct ethhdr); /* reuse skb for the last subframe */ - if (remaining <= subframe_len + padding) + last = remaining <= subframe_len + padding; + if (!skb_is_nonlinear(skb) && !reuse_frag && last) { + skb_pull(skb, offset); frame = skb; - else { - unsigned int hlen = ALIGN(extra_headroom, 4); - /* - * Allocate and reserve two bytes more for payload - * alignment since sizeof(struct ethhdr) is 14. - */ - frame = dev_alloc_skb(hlen + subframe_len + 2); + reuse_skb = true; + } else { + frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, + reuse_frag); if (!frame) goto purge; - skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); - memcpy(skb_put(frame, ntohs(len)), skb->data, - ntohs(len)); - - eth = (struct ethhdr *)skb_pull(skb, ntohs(len) + - padding); - if (!eth) { - dev_kfree_skb(frame); - goto purge; - } + offset += len + padding; } skb_reset_network_header(frame); @@ -716,24 +793,20 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, payload = frame->data; ethertype = (payload[6] << 8) | payload[7]; - if (likely((ether_addr_equal(payload, rfc1042_header) && ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || ether_addr_equal(payload, bridge_tunnel_header))) { - /* remove RFC1042 or Bridge-Tunnel - * encapsulation and replace EtherType */ - skb_pull(frame, 6); - memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); - } else { - memcpy(skb_push(frame, sizeof(__be16)), &len, - sizeof(__be16)); - memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + eth.h_proto = htons(ethertype); + skb_pull(frame, ETH_ALEN + 2); } + + memcpy(skb_push(frame, sizeof(eth)), ð, sizeof(eth)); __skb_queue_tail(list, frame); } + if (!reuse_skb) + dev_kfree_skb(skb); + return; purge: diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index c8717c1d082e..b50ee5d622e1 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -342,6 +342,40 @@ static const int compat_event_type_size[] = { /* IW event code */ +void wireless_nlevent_flush(void) +{ + struct sk_buff *skb; + struct net *net; + + ASSERT_RTNL(); + + for_each_net(net) { + while ((skb = skb_dequeue(&net->wext_nlevents))) + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, + GFP_KERNEL); + } +} +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); + +static int wext_netdev_notifier_call(struct notifier_block *nb, + unsigned long state, void *ptr) +{ + /* + * When a netdev changes state in any way, flush all pending messages + * to avoid them going out in a strange order, e.g. RTM_NEWLINK after + * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() + * or similar - all of which could otherwise happen due to delays from + * schedule_work(). + */ + wireless_nlevent_flush(); + + return NOTIFY_OK; +} + +static struct notifier_block wext_netdev_notifier = { + .notifier_call = wext_netdev_notifier_call, +}; + static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); @@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = { static int __init wireless_nlevent_init(void) { - return register_pernet_subsys(&wext_pernet_ops); + int err = register_pernet_subsys(&wext_pernet_ops); + + if (err) + return err; + + return register_netdevice_notifier(&wext_netdev_notifier); } subsys_initcall(wireless_nlevent_init); @@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - struct sk_buff *skb; - struct net *net; - rtnl_lock(); - - for_each_net(net) { - while ((skb = skb_dequeue(&net->wext_nlevents))) - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, - GFP_KERNEL); - } - + wireless_nlevent_flush(); rtnl_unlock(); } |