Diffstat (limited to 'net')
203 files changed, 9752 insertions, 3302 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a0b2d8b9def7..93eadf179123 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -580,6 +580,7 @@ static int vlan_dev_init(struct net_device *dev)
 	dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
 
 	dev->hw_enc_features = vlan_tnl_features(real_dev);
+	dev->mpls_features = real_dev->mpls_features;
 
 	/* ipv6 shared card related stuff */
 	dev->dev_id = real_dev->dev_id;
diff --git a/net/Kconfig b/net/Kconfig
index d122f53c6fa2..57f51a279ad6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -67,8 +67,6 @@ source "net/xdp/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
-	select CRYPTO
-	select CRYPTO_AES
 	---help---
 	  These are the protocols used on the Internet and on most local
 	  Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c05def8fd9cd..681b72862c16 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -52,6 +52,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	br_switchdev_frame_unmark(skb);
 	BR_INPUT_SKB_CB(skb)->brdev = dev;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
 
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 34fa72c72ad8..d3f9592f4ff8 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -47,25 +47,22 @@ static unsigned int brnf_net_id __read_mostly;
 
 struct brnf_net {
 	bool enabled;
-};
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
+	struct ctl_table_header *ctl_hdr;
 #endif
+
+	/* default value is 1 */
+	int call_iptables;
+	int call_ip6tables;
+	int call_arptables;
+
+	/* default value is 0 */
+	int filter_vlan_tagged;
+	int filter_pppoe_tagged;
+	int pass_vlan_indev;
+};
 
 #define IS_IP(skb) \
 	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
@@ -85,17 +82,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
 		return 0;
 }
 
-#define IS_VLAN_IP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IP) && \
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
 
-#define IS_VLAN_IPV6(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+				const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_VLAN_ARP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_ARP) && \
-	 brnf_filter_vlan_tagged)
+	return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+	       brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -103,15 +111,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 			    sizeof(struct pppoe_hdr)));
 }
 
-#define IS_PPPOE_IP(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IP) && \
-	 brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+				 const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_PPPOE_IPV6(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IPV6) && \
-	 brnf_filter_pppoe_tagged)
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IPV6) &&
+	       brnet->filter_pppoe_tagged;
+}
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -408,12 +424,16 @@ bridged_dnat:
 	return 0;
 }
 
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+					       const struct net_device *dev,
+					       const struct net *net)
 {
 	struct net_device *vlan, *br;
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
 	br = bridge_parent(dev);
-	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -423,7 +443,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 }
 
 /* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
@@ -434,7 +454,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
-	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
 
 	if (skb->protocol == htons(ETH_P_8021Q))
 		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -460,6 +480,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	__u32 len = nf_bridge_encap_header_len(skb);
+	struct brnf_net *brnet;
 
 	if (unlikely(!pskb_may_pull(skb, len)))
 		return NF_DROP;
@@ -469,8 +490,10 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return NF_DROP;
 	br = p->br;
 
-	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-		if (!brnf_call_ip6tables &&
+	brnet = net_generic(state->net, brnf_net_id);
+	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+	    is_pppoe_ipv6(skb, state->net)) {
+		if (!brnet->call_ip6tables &&
 		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
 			return NF_ACCEPT;
 
@@ -478,10 +501,11 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return br_nf_pre_routing_ipv6(priv, skb, state);
 	}
 
-	if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+	if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
 		return NF_ACCEPT;
 
-	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+	if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+	    !is_pppoe_ip(skb, state->net))
 		return NF_ACCEPT;
 
 	nf_bridge_pull_encap_header_rcsum(skb);
@@ -491,7 +515,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
@@ -514,7 +538,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct net_device *in;
 
-	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
 		if (skb->protocol == htons(ETH_P_IP))
 			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -569,9 +593,11 @@ static unsigned int br_nf_forward_ip(void *priv,
 	if (!parent)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -602,7 +628,7 @@ static unsigned int br_nf_forward_ip(void *priv,
 		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
-		brnf_get_logical_dev(skb, state->in),
+		brnf_get_logical_dev(skb, state->in, state->net),
 		parent,	br_nf_forward_finish);
 
 	return NF_STOLEN;
@@ -615,23 +641,25 @@ static unsigned int br_nf_forward_arp(void *priv,
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	struct net_device **d = (struct net_device **)(skb->cb);
+	struct brnf_net *brnet;
 
 	p = br_port_get_rcu(state->out);
 	if (p == NULL)
 		return NF_ACCEPT;
 	br = p->br;
 
-	if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+	brnet = net_generic(state->net, brnf_net_id);
+	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_ARP(skb)) {
-		if (!IS_VLAN_ARP(skb))
+		if (!is_vlan_arp(skb, state->net))
 			return NF_ACCEPT;
 		nf_bridge_pull_encap_header(skb);
 	}
 
 	if (arp_hdr(skb)->ar_pln != 4) {
-		if (IS_VLAN_ARP(skb))
+		if (is_vlan_arp(skb, state->net))
 			nf_bridge_push_encap_header(skb);
 		return NF_ACCEPT;
 	}
@@ -791,9 +819,11 @@ static unsigned int br_nf_post_routing(void *priv,
 	if (!realoutdev)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -946,23 +976,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event,
 	return NOTIFY_OK;
 }
 
-static void __net_exit brnf_exit_net(struct net *net)
-{
-	struct brnf_net *brnet = net_generic(net, brnf_net_id);
-
-	if (!brnet->enabled)
-		return;
-
-	nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
-	brnet->enabled = false;
-}
-
-static struct pernet_operations brnf_net_ops __read_mostly = {
-	.exit = brnf_exit_net,
-	.id   = &brnf_net_id,
-	.size = sizeof(struct brnf_net),
-};
-
 static struct notifier_block brnf_notifier __read_mostly = {
 	.notifier_call = brnf_device_event,
 };
@@ -1021,49 +1034,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
 static struct ctl_table brnf_table[] = {
 	{
 		.procname	= "bridge-nf-call-arptables",
-		.data		= &brnf_call_arptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-iptables",
-		.data		= &brnf_call_iptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-ip6tables",
-		.data		= &brnf_call_ip6tables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-vlan-tagged",
-		.data		= &brnf_filter_vlan_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-pppoe-tagged",
-		.data		= &brnf_filter_pppoe_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-pass-vlan-input-dev",
-		.data		= &brnf_pass_vlan_indev,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{ }
 };
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+	brnf->call_iptables = 1;
+	brnf->call_ip6tables = 1;
+	brnf->call_arptables = 1;
+	brnf->filter_vlan_tagged = 0;
+	brnf->filter_pppoe_tagged = 0;
+	brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+	struct ctl_table *table = brnf_table;
+	struct brnf_net *brnet;
+
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	brnet = net_generic(net, brnf_net_id);
+	table[0].data = &brnet->call_arptables;
+	table[1].data = &brnet->call_iptables;
+	table[2].data = &brnet->call_ip6tables;
+	table[3].data = &brnet->filter_vlan_tagged;
+	table[4].data = &brnet->filter_pppoe_tagged;
+	table[5].data = &brnet->pass_vlan_indev;
+
+	br_netfilter_sysctl_default(brnet);
+
+	brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table);
+	if (!brnet->ctl_hdr) {
+		if (!net_eq(net, &init_net))
+			kfree(table);
+
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+					 struct brnf_net *brnet)
+{
+	struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(brnet->ctl_hdr);
+	if (!net_eq(net, &init_net))
+		kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+	return br_netfilter_sysctl_init_net(net);
+}
+#endif
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+	struct brnf_net *brnet;
+
+	brnet = net_generic(net, brnf_net_id);
+	if (brnet->enabled) {
+		nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		brnet->enabled = false;
+	}
+
+#ifdef CONFIG_SYSCTL
+	br_netfilter_sysctl_exit_net(net, brnet);
 #endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+	.init = brnf_init_net,
+#endif
+	.exit = brnf_exit_net,
+	.id   = &brnf_net_id,
+	.size = sizeof(struct brnf_net),
+};
 
 static int __init br_netfilter_init(void)
 {
@@ -1079,16 +1167,6 @@ static int __init br_netfilter_init(void)
 		return ret;
 	}
 
-#ifdef CONFIG_SYSCTL
-	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
-	if (brnf_sysctl_header == NULL) {
-		printk(KERN_WARNING
-		       "br_netfilter: can't register to sysctl.\n");
-		unregister_netdevice_notifier(&brnf_notifier);
-		unregister_pernet_subsys(&brnf_net_ops);
-		return -ENOMEM;
-	}
-#endif
 	RCU_INIT_POINTER(nf_br_ops, &br_ops);
 	printk(KERN_NOTICE "Bridge firewalling registered\n");
 	return 0;
@@ -1099,9 +1177,6 @@ static void __exit br_netfilter_fini(void)
 	RCU_INIT_POINTER(nf_br_ops, NULL);
 	unregister_netdevice_notifier(&brnf_notifier);
 	unregister_pernet_subsys(&brnf_net_ops);
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(brnf_sysctl_header);
-#endif
 }
 
 module_init(br_netfilter_init);
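The br_netfilter conversion above follows the standard recipe for making module state per network namespace: the state hangs off net_generic() storage (allocated by the core according to pernet_operations.size), and the ctl_table is duplicated for non-init namespaces so each netns gets its own sysctl data pointers. A stripped-down sketch of that recipe, with hypothetical foo_* names, not part of this patch:

	/* minimal per-netns state sketch; foo_* names are hypothetical */
	static unsigned int foo_net_id __read_mostly;

	struct foo_net {
		int some_knob;		/* one instance per struct net */
	};

	static int __net_init foo_init_net(struct net *net)
	{
		struct foo_net *fn = net_generic(net, foo_net_id);

		fn->some_knob = 1;	/* per-namespace default */
		return 0;
	}

	static struct pernet_operations foo_net_ops = {
		.init = foo_init_net,
		.id   = &foo_net_id,	/* core allocates .size bytes per netns */
		.size = sizeof(struct foo_net),
	};

	/* registration (module init): register_pernet_subsys(&foo_net_ops);
	 * any hook can then read the per-netns value with:
	 *	struct foo_net *fn = net_generic(net, foo_net_id);
	 */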
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 0e63e5dc5ac4..e4e0c836c3f5 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -224,7 +224,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
 	nf_bridge = nf_bridge_alloc(skb);
 	if (!nf_bridge)
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 159a0e2cb0f6..e8cf03b43b7d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -421,6 +421,7 @@ struct net_bridge {
 struct br_input_skb_cb {
 	struct net_device *brdev;
 
+	u16 frag_max_size;
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	u8 igmp;
 	u8 mrouters_only:1;
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c3ad90c43801..f4fb0b9b927d 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -19,6 +19,20 @@ config NF_LOG_BRIDGE
 	tristate "Bridge packet logging"
 	select NF_LOG_COMMON
 
+config NF_CONNTRACK_BRIDGE
+	tristate "IPv4/IPV6 bridge connection tracking support"
+	depends on NF_CONNTRACK
+	default n
+	help
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections. This is used to enhance packet filtering via
+	  stateful policies. Enable this if you want native tracking from
+	  the bridge. This provides a replacement for the `br_netfilter'
+	  infrastructure.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 endif # NF_TABLES_BRIDGE
 
 menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9b868861f21a..9d7767322a64 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -5,6 +5,9 @@
 obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
 
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
+
 # packet logging
 obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
 
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index eeae23a73c6a..ed91ea31978a 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct ebt_nat_info *info = par->targinfo;
 	struct net_device *dev;
 
-	if (!skb_make_writable(skb, 0))
+	if (skb_ensure_writable(skb, ETH_ALEN))
 		return EBT_DROP;
 
 	ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 53ef08e6765f..0cad62a4052b 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_redirect_info *info = par->targinfo;
 
-	if (!skb_make_writable(skb, 0))
+	if (skb_ensure_writable(skb, ETH_ALEN))
 		return EBT_DROP;
 
 	if (xt_hooknum(par) != NF_BR_BROUTING)
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 700d338d5ddb..27443bf229a3 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_nat_info *info = par->targinfo;
 
-	if (!skb_make_writable(skb, 0))
+	if (skb_ensure_writable(skb, ETH_ALEN * 2))
 		return EBT_DROP;
 
 	ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
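Note the inverted test in the three ebtables conversions above: skb_make_writable() returned true on success, while skb_ensure_writable() returns 0 on success and a negative errno on failure, and takes the number of bytes that must actually be writable (one MAC address for dnat/redirect, two for snat rewriting the source past the destination) instead of 0. A hedged sketch of the new idiom, with a hypothetical target:

	/* illustrative ebtables-style target; only skb_ensure_writable()
	 * and the verdict codes are real kernel API */
	static unsigned int example_mangle_tg(struct sk_buff *skb)
	{
		/* need ETH_ALEN writable bytes to rewrite h_dest, the first
		 * field of struct ethhdr; 0 means success here */
		if (skb_ensure_writable(skb, ETH_ALEN))
			return EBT_DROP;

		eth_hdr(skb)->h_dest[0] ^= 0x02;	/* flip the local bit */
		return EBT_CONTINUE;
	}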
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..4f5444d2a526
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+			     struct sk_buff *skb,
+			     struct nf_ct_bridge_frag_data *data,
+			     int (*output)(struct net *, struct sock *sk,
+					   const struct nf_ct_bridge_frag_data *data,
+					   struct sk_buff *))
+{
+	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	unsigned int hlen, ll_rs, mtu;
+	struct ip_frag_state state;
+	struct iphdr *iph;
+	int err;
+
+	/* for offloaded checksums cleanup checksum before fragmentation */
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto blackhole;
+
+	iph = ip_hdr(skb);
+
+	/*
+	 *	Setup starting values
+	 */
+
+	hlen = iph->ihl * 4;
+	frag_max_size -= hlen;
+	ll_rs = LL_RESERVED_SPACE(skb->dev);
+	mtu = skb->dev->mtu;
+
+	if (skb_has_frag_list(skb)) {
+		unsigned int first_len = skb_pagelen(skb);
+		struct ip_fraglist_iter iter;
+		struct sk_buff *frag;
+
+		if (first_len - hlen > mtu ||
+		    skb_headroom(skb) < ll_rs)
+			goto blackhole;
+
+		if (skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag) {
+			if (frag->len > mtu ||
+			    skb_headroom(frag) < hlen + ll_rs)
+				goto blackhole;
+
+			if (skb_shared(frag))
+				goto slow_path;
+		}
+
+		ip_fraglist_init(skb, iph, hlen, &iter);
+
+		for (;;) {
+			if (iter.frag)
+				ip_fraglist_prepare(skb, &iter);
+
+			err = output(net, sk, data, skb);
+			if (err || !iter.frag)
+				break;
+
+			skb = ip_fraglist_next(&iter);
+		}
+		return err;
+	}
+slow_path:
+	/* This is a linearized skbuff, the original geometry is lost for us.
+	 * This may also be a clone skbuff, we could preserve the geometry for
+	 * the copies but probably not worth the effort.
+	 */
+	ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+
+	while (state.left > 0) {
+		struct sk_buff *skb2;
+
+		skb2 = ip_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
+			goto blackhole;
+		}
+
+		err = output(net, sk, data, skb2);
+		if (err)
+			goto blackhole;
+	}
+	consume_skb(skb);
+	return err;
+
+blackhole:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* ip_defrag() expects IPCB() in place. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+			   size_t inet_skb_parm_size)
+{
+	memcpy(cb, skb->cb, sizeof(*cb));
+	memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+			      const struct br_input_skb_cb *cb,
+			      u16 fragsz)
+{
+	memcpy(skb->cb, cb, sizeof(*cb));
+	BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+	enum ip_conntrack_info ctinfo;
+	struct br_input_skb_cb cb;
+	const struct nf_conn *ct;
+	int err;
+
+	if (!ip_is_fragment(ip_hdr(skb)))
+		return NF_ACCEPT;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+	br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+	local_bh_disable();
+	err = ip_defrag(state->net, skb,
+			IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+	local_bh_enable();
+	if (!err) {
+		br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+		skb->ignore_df = 1;
+		return NF_ACCEPT;
+	}
+
+	return NF_STOLEN;
+}
+
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+	enum ip_conntrack_info ctinfo;
+	struct br_input_skb_cb cb;
+	const struct nf_conn *ct;
+	int err;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+	br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+	err = nf_ipv6_br_defrag(state->net, skb,
+				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+	/* queued */
+	if (err == -EINPROGRESS)
+		return NF_STOLEN;
+
+	br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+	return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	int nhoff, len;
+
+	nhoff = skb_network_offset(skb);
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 ||
+	    iph->version != 4)
+		return -1;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < nhoff + len ||
+	    len < (iph->ihl * 4))
+		return -1;
+
+	return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	int nhoff, len;
+
+	nhoff = skb_network_offset(skb);
+	hdr = ipv6_hdr(skb);
+	if (hdr->version != 6)
+		return -1;
+
+	len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+	if (skb->len < len)
+		return -1;
+
+	return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	struct nf_hook_state bridge_state = *state;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	u32 len;
+	int ret;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) ||
+	    ctinfo == IP_CT_UNTRACKED)
+		return NF_ACCEPT;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+			return NF_ACCEPT;
+
+		len = ntohs(ip_hdr(skb)->tot_len);
+		if (pskb_trim_rcsum(skb, len))
+			return NF_ACCEPT;
+
+		if (nf_ct_br_ip_check(skb))
+			return NF_ACCEPT;
+
+		bridge_state.pf = NFPROTO_IPV4;
+		ret = nf_ct_br_defrag4(skb, &bridge_state);
+		break;
+	case htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+			return NF_ACCEPT;
+
+		len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+		if (pskb_trim_rcsum(skb, len))
+			return NF_ACCEPT;
+
+		if (nf_ct_br_ipv6_check(skb))
+			return NF_ACCEPT;
+
+		bridge_state.pf = NFPROTO_IPV6;
+		ret = nf_ct_br_defrag6(skb, &bridge_state);
+		break;
+	default:
+		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+		return NF_ACCEPT;
+	}
+
+	if (ret != NF_ACCEPT)
+		return ret;
+
+	return nf_conntrack_in(skb, &bridge_state);
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+				   struct nf_ct_bridge_frag_data *data)
+{
+	if (skb_vlan_tag_present(skb)) {
+		data->vlan_present = true;
+		data->vlan_tci = skb->vlan_tci;
+		data->vlan_proto = skb->vlan_proto;
+	} else {
+		data->vlan_present = false;
+	}
+	skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *))
+{
+	struct nf_ct_bridge_frag_data data;
+
+	if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+		return NF_ACCEPT;
+
+	nf_ct_bridge_frag_save(skb, &data);
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+		break;
+	case htons(ETH_P_IPV6):
+		nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return NF_DROP;
+	}
+
+	return NF_STOLEN;
+}
+
+/* Actually only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+				     const struct nf_ct_bridge_frag_data *data)
+{
+	int err;
+
+	err = skb_cow_head(skb, ETH_HLEN);
+	if (err) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	if (data->vlan_present)
+		__vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+	else if (skb_vlan_tag_present(skb))
+		__vlan_hwaccel_clear_tag(skb);
+
+	skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+	skb_reset_mac_header(skb);
+
+	return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+				    const struct nf_ct_bridge_frag_data *data,
+				    struct sk_buff *skb)
+{
+	int err;
+
+	err = nf_ct_bridge_frag_restore(skb, data);
+	if (err < 0)
+		return err;
+
+	return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int protoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		return nf_conntrack_confirm(skb);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+		break;
+	case htons(ETH_P_IPV6): {
+		unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+		__be16 frag_off;
+
+		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+					   &frag_off);
+		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+			return nf_conntrack_confirm(skb);
+	}
+		break;
+	default:
+		return NF_ACCEPT;
+	}
+	return nf_confirm(skb, protoff, ct, ctinfo);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	int ret;
+
+	ret = nf_ct_bridge_confirm(skb);
+	if (ret != NF_ACCEPT)
+		return ret;
+
+	return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+	{
+		.hook		= nf_ct_bridge_pre,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= nf_ct_bridge_post,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+	.ops		= nf_ct_bridge_hook_ops,
+	.ops_size	= ARRAY_SIZE(nf_ct_bridge_hook_ops),
+	.me		= THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+	nf_ct_bridge_register(&bridge_info);
+
+	return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+	nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d1c4e1f3be2c..94c7f77ecb6b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	unsigned int i;
 	u32 nbuckets;
 	u64 cost;
+	int ret;
 
 	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
 	if (!smap)
@@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
 	smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
 	nbuckets = 1U << smap->bucket_log;
+	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+	ret = bpf_map_charge_init(&smap->map.memory, cost);
+	if (ret < 0) {
+		kfree(smap);
+		return ERR_PTR(ret);
+	}
+
 	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 				 GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
+		bpf_map_charge_finish(&smap->map.memory);
 		kfree(smap);
 		return ERR_PTR(-ENOMEM);
 	}
-	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
 
 	for (i = 0; i < nbuckets; i++) {
 		INIT_HLIST_HEAD(&smap->buckets[i].list);
@@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
 		BPF_SK_STORAGE_CACHE_SIZE;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	return &smap->map;
 }
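The bpf_sk_storage change just above moves memory accounting from an after-the-fact map.pages assignment to the bpf_map_charge_init()/bpf_map_charge_finish() pair, so the memlock budget is reserved before the buckets are allocated and released again if allocation fails. A distilled sketch of that ordering, with hypothetical names apart from the two charge helpers:

	/* charge-before-allocate pattern; example_map_alloc and struct
	 * my_map are illustrative only */
	static struct bpf_map *example_map_alloc(struct my_map *m, u32 nbuckets)
	{
		u64 cost = sizeof(*m->buckets) * nbuckets + sizeof(*m);
		int ret;

		/* reserve against the RLIMIT_MEMLOCK budget first ... */
		ret = bpf_map_charge_init(&m->map.memory, cost);
		if (ret < 0)
			return ERR_PTR(ret);	/* nothing allocated yet */

		m->buckets = kvcalloc(nbuckets, sizeof(*m->buckets),
				      GFP_USER | __GFP_NOWARN);
		if (!m->buckets) {
			/* ... and give the reservation back on failure */
			bpf_map_charge_finish(&m->map.memory);
			return ERR_PTR(-ENOMEM);
		}
		return &m->map;
	}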
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 132f4b757963..4baf716e535e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -17,6 +17,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/refcount.h>
+#include <linux/workqueue.h>
 #include <rdma/ib_verbs.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
@@ -2668,6 +2669,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 	return devlink->ops->reload(devlink, info->extack);
 }
 
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+					struct devlink *devlink,
+					enum devlink_command cmd,
+					const char *status_msg,
+					const char *component,
+					unsigned long done, unsigned long total)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+		goto out;
+
+	if (status_msg &&
+	    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+			   status_msg))
+		goto nla_put_failure;
+	if (component &&
+	    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+			   component))
+		goto nla_put_failure;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+			      done, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+			      total, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+
+out:
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+					  enum devlink_command cmd,
+					  const char *status_msg,
+					  const char *component,
+					  unsigned long done,
+					  unsigned long total)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+		cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+		cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+					   component, done, total);
+	if (err)
+		goto out_free_msg;
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+	return;
+
+out_free_msg:
+	nlmsg_free(msg);
+}
+
+void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE,
+				      NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
+
+void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE_END,
+				      NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+					const char *status_msg,
+					const char *component,
+					unsigned long done,
+					unsigned long total)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE_STATUS,
+				      status_msg, component, done, total);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
 static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
 				       struct genl_info *info)
 {
@@ -4415,6 +4518,35 @@ nla_put_failure:
 	return err;
 }
 
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+			       struct netlink_callback *cb,
+			       enum devlink_command cmd)
+{
+	int index = cb->args[0];
+	int tmp_index = index;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto nla_put_failure;
+	}
+
+	err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+	if ((err && err != -EMSGSIZE) || tmp_index == index)
+		goto nla_put_failure;
+
+	cb->args[0] = index;
+	genlmsg_end(skb, hdr);
+	return skb->len;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return err;
+}
+
 struct devlink_health_reporter {
 	struct list_head list;
 	void *priv;
@@ -4647,17 +4779,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 EXPORT_SYMBOL_GPL(devlink_health_report);
 
 static struct devlink_health_reporter *
-devlink_health_reporter_get_from_info(struct devlink *devlink,
-				      struct genl_info *info)
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+				       struct nlattr **attrs)
 {
 	struct devlink_health_reporter *reporter;
 	char *reporter_name;
 
-	if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+	if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
 		return NULL;
 
-	reporter_name =
-		nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+	reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
 	mutex_lock(&devlink->reporters_lock);
 	reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
 	if (reporter)
@@ -4666,6 +4797,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink,
 	return reporter;
 }
 
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+				      struct genl_info *info)
+{
+	return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
+{
+	struct devlink_health_reporter *reporter;
+	struct devlink *devlink;
+	struct nlattr **attrs;
+	int err;
+
+	attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+
+	err = nlmsg_parse_deprecated(cb->nlh,
+				     GENL_HDRLEN + devlink_nl_family.hdrsize,
+				     attrs, DEVLINK_ATTR_MAX,
+				     devlink_nl_family.policy, cb->extack);
+	if (err)
+		goto free;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+	if (IS_ERR(devlink))
+		goto unlock;
+
+	reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+	mutex_unlock(&devlink_mutex);
+	kfree(attrs);
+	return reporter;
+
+unlock:
+	mutex_unlock(&devlink_mutex);
+free:
+	kfree(attrs);
+	return NULL;
+}
+
 static void
 devlink_health_reporter_put(struct devlink_health_reporter *reporter)
 {
@@ -4901,32 +5074,40 @@ out:
 	return err;
 }
 
-static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
-							struct genl_info *info)
+static int
+devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+					       struct netlink_callback *cb)
 {
-	struct devlink *devlink = info->user_ptr[0];
 	struct devlink_health_reporter *reporter;
+	u64 start = cb->args[0];
 	int err;
 
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	reporter = devlink_health_reporter_get_from_cb(cb);
 	if (!reporter)
 		return -EINVAL;
 
 	if (!reporter->ops->dump) {
-		devlink_health_reporter_put(reporter);
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		goto out;
 	}
-
 	mutex_lock(&reporter->dump_lock);
-	err = devlink_health_do_dump(reporter, NULL);
-	if (err)
-		goto out;
-
-	err = devlink_fmsg_snd(reporter->dump_fmsg, info,
-			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0);
+	if (!start) {
+		err = devlink_health_do_dump(reporter, NULL);
+		if (err)
+			goto unlock;
+		cb->args[1] = reporter->dump_ts;
+	}
+	if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) {
+		NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry");
+		err = -EAGAIN;
+		goto unlock;
+	}
 
-out:
+	err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+				  DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
 	mutex_unlock(&reporter->dump_lock);
+out:
 	devlink_health_reporter_put(reporter);
 	return err;
 }
@@ -5263,7 +5444,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 	{
 		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
 		.validate = GENL_DONT_VALIDATE_STRICT |
 			    GENL_DONT_VALIDATE_DUMP,
-		.doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+		.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
 				  DEVLINK_NL_FLAG_NO_LOCK,
@@ -5386,6 +5567,38 @@ void devlink_free(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_free);
 
+static void devlink_port_type_warn(struct work_struct *work)
+{
+	WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+	/* Ignore CPU and DSA flavours. */
+	return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+	       devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+	if (!devlink_port_type_should_warn(devlink_port))
+		return;
+	/* Schedule a work to WARN in case driver does not set port
+	 * type within timeout.
+	 */
+	schedule_delayed_work(&devlink_port->type_warn_dw,
+			      DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+	if (!devlink_port_type_should_warn(devlink_port))
+		return;
+	cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
 /**
  *	devlink_port_register - Register devlink port
  *
@@ -5415,6 +5628,8 @@ int devlink_port_register(struct devlink *devlink,
 	list_add_tail(&devlink_port->list, &devlink->port_list);
 	INIT_LIST_HEAD(&devlink_port->param_list);
 	mutex_unlock(&devlink->lock);
+	INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+	devlink_port_type_warn_schedule(devlink_port);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -5429,6 +5644,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
 {
 	struct devlink *devlink = devlink_port->devlink;
 
+	devlink_port_type_warn_cancel(devlink_port);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
 	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
@@ -5442,6 +5658,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
 {
 	if (WARN_ON(!devlink_port->registered))
 		return;
+	devlink_port_type_warn_cancel(devlink_port);
 	spin_lock(&devlink_port->type_lock);
 	devlink_port->type = type;
 	devlink_port->type_dev = type_dev;
@@ -5515,6 +5732,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
 void devlink_port_type_clear(struct devlink_port *devlink_port)
 {
 	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+	devlink_port_type_warn_schedule(devlink_port);
 }
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
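The newly exported devlink_flash_update_{begin,status,end}_notify() helpers above let a driver stream firmware-flash progress to userspace as DEVLINK_CMD_FLASH_UPDATE* multicast notifications. A hedged sketch of how a driver's ->flash_update() op might use them; the foo_* driver and the chunking are hypothetical, only the three notify calls are API from this patch:

	static int foo_devlink_flash_update(struct devlink *devlink,
					    const char *file_name,
					    const char *component,
					    struct netlink_ext_ack *extack)
	{
		unsigned long done, total = 100;

		devlink_flash_update_begin_notify(devlink);
		for (done = 0; done < total; done += 10) {
			/* write one firmware chunk to the device here */
			devlink_flash_update_status_notify(devlink, "Flashing",
							   component, done,
							   total);
		}
		devlink_flash_update_end_notify(devlink);
		return 0;
	}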
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d1011b2e24f..6288e69e94fc 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 		match->mask.basic.n_proto = htons(0xffff);
 
 	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+	case ETHER_FLOW: {
+		const struct ethhdr *ether_spec, *ether_m_spec;
+
+		ether_spec = &fs->h_u.ether_spec;
+		ether_m_spec = &fs->m_u.ether_spec;
+
+		if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+			ether_addr_copy(match->key.eth_addrs.src,
+					ether_spec->h_source);
+			ether_addr_copy(match->mask.eth_addrs.src,
+					ether_m_spec->h_source);
+		}
+		if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+			ether_addr_copy(match->key.eth_addrs.dst,
+					ether_spec->h_dest);
+			ether_addr_copy(match->mask.eth_addrs.dst,
+					ether_m_spec->h_dest);
+		}
+		if (ether_m_spec->h_proto) {
+			match->key.basic.n_proto = ether_spec->h_proto;
+			match->mask.basic.n_proto = ether_m_spec->h_proto;
+		}
+	}
+		break;
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW: {
 		const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
diff --git a/net/core/filter.c b/net/core/filter.c
index f615e42cf4ef..2014d76e0d2a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -62,6 +62,7 @@
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
@@ -4670,7 +4671,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (res.type != RTN_UNICAST)
 		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
-	if (res.fi->fib_nhs > 1)
+	if (fib_info_num_path(res.fi) > 1)
 		fib_select_path(net, &res, &fl4, NULL);
 
 	if (check_mtu) {
@@ -5694,6 +5695,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 	return INET_ECN_set_ce(skb);
 }
 
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	default:
+		return size == sizeof(__u32);
+	}
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD)						\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) >	\
+			     FIELD_SIZEOF(struct bpf_xdp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(struct xdp_sock, FIELD)); \
+	} while (0)
+
+	switch (si->off) {
+	case offsetof(struct bpf_xdp_sock, queue_id):
+		BPF_XDP_SOCK_GET(queue_id);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
 	.func		= bpf_skb_ecn_set_ce,
 	.gpl_only	= false,
@@ -5896,6 +5937,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_sock_addr_skc_lookup_tcp_proto;
 #endif /* CONFIG_INET */
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5933,6 +5978,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_storage_get_proto;
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	case BPF_FUNC_skb_cgroup_id:
+		return &bpf_skb_cgroup_id_proto;
+#endif
 #ifdef CONFIG_INET
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
@@ -6113,6 +6162,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_sockopt_event_output_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6800,6 +6857,13 @@ static bool sock_addr_is_valid_access(int off, int size,
 		if (size != size_default)
 			return false;
 		break;
+	case offsetof(struct bpf_sock_addr, sk):
+		if (type != BPF_READ)
+			return false;
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
 	default:
 		if (type == BPF_READ) {
 			if (size != size_default)
@@ -6843,6 +6907,11 @@ static bool sock_ops_is_valid_access(int off, int size,
 			if (size != sizeof(__u64))
 				return false;
 			break;
+		case offsetof(struct bpf_sock_ops, sk):
+			if (size != sizeof(__u64))
+				return false;
+			info->reg_type = PTR_TO_SOCKET_OR_NULL;
+			break;
 		default:
 			if (size != size_default)
 				return false;
@@ -7750,6 +7819,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
 						    struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
 						    s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
 		break;
+	case offsetof(struct bpf_sock_addr, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_addr_kern, sk));
+		break;
 	}
 
 	return insn - insn_buf;
@@ -8009,6 +8083,19 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
 					  struct sock, type);
 		break;
+	case offsetof(struct bpf_sock_ops, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		break;
 	}
 	return insn - insn_buf;
 }
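The filter.c hunks above expose a `sk` pointer on struct bpf_sock_ops (loaded as PTR_TO_SOCKET_OR_NULL, hence the is_fullsock guard in the generated instructions) and wire bpf_sk_storage_get()/bpf_sk_storage_delete() into the sockops program type. A hedged BPF-C sketch of what that enables; the map and program names are illustrative, and the BTF-style map definition assumes a libbpf of roughly this era:

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} sk_events SEC(".maps");

	SEC("sockops")
	int count_sockops(struct bpf_sock_ops *skops)
	{
		__u64 *val;

		if (!skops->sk)		/* sk may be NULL: must check */
			return 1;

		val = bpf_sk_storage_get(&sk_events, skops->sk, 0,
					 BPF_SK_STORAGE_GET_F_CREATE);
		if (val)
			(*val)++;	/* one counter per socket */
		return 1;
	}

	char _license[] SEC("license") = "GPL";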
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index edd622956083..01ad60b5aa75 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -199,6 +199,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+			   struct flow_dissector *flow_dissector,
+			   void *target_container)
+{
+	struct flow_dissector_key_meta *meta;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+		return;
+
+	meta = skb_flow_dissector_target(flow_dissector,
+					 FLOW_DISSECTOR_KEY_META,
+					 target_container);
+	meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
 static void
 skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 				   struct flow_dissector *flow_dissector,
@@ -757,7 +773,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 * @flags: flags that control the dissection process, e.g.
- *         FLOW_DISSECTOR_F_STOP_AT_L3.
+ *         FLOW_DISSECTOR_F_STOP_AT_ENCAP.
 *
 * The function will try to retrieve individual keys into target specified
 * by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -922,11 +938,6 @@ proto_again:
 			__skb_flow_dissect_ipv4(skb, flow_dissector,
 						target_container, data, iph);
 
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
-			fdret = FLOW_DISSECT_RET_OUT_GOOD;
-			break;
-		}
-
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -975,9 +986,6 @@ proto_again:
 			__skb_flow_dissect_ipv6(skb, flow_dissector,
 						target_container, data, iph);
 
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
-			fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
 		break;
 	}
 	case htons(ETH_P_8021AD):
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 5ce7d47a960e..f52fe0bc4017 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
 	struct flow_rule *rule;
 
-	rule = kzalloc(sizeof(struct flow_rule) +
-		       sizeof(struct flow_action_entry) * num_actions,
+	rule = kzalloc(struct_size(rule, action.entries, num_actions),
 		       GFP_KERNEL);
 	if (!rule)
 		return NULL;
@@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc);
 	(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key);	\
 	(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask);	\
 
+void flow_rule_match_meta(const struct flow_rule *rule,
+			  struct flow_match_meta *out)
+{
+	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
 void flow_rule_match_basic(const struct flow_rule *rule,
 			   struct flow_match_basic *out)
 {
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index fd822ca5a245..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(hwbm_pool_refill);
 
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
 {
 	int err, i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bm_pool->lock, flags);
+	mutex_lock(&bm_pool->buf_lock);
 	if (bm_pool->buf_num == bm_pool->size) {
 		pr_warn("pool already filled\n");
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return bm_pool->buf_num;
 	}
 
 	if (buf_num + bm_pool->buf_num > bm_pool->size) {
 		pr_warn("cannot allocate %d buffers for pool\n",
 			buf_num);
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return 0;
 	}
 
 	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
 		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
 			buf_num, bm_pool->buf_num);
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return 0;
 	}
 
 	for (i = 0; i < buf_num; i++) {
-		err = hwbm_pool_refill(bm_pool, gfp);
+		err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
 		if (err < 0)
 			break;
 	}
@@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
 	bm_pool->buf_num += i;
 	pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
-	spin_unlock_irqrestore(&bm_pool->lock, flags);
+	mutex_unlock(&bm_pool->buf_lock);
 
 	return i;
 }
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9e7fc929bc50..742cea4ce72e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
 	int error;
 	struct neigh_hash_table *nht;
 
+	trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
+
 	if (!n) {
 		rc = ERR_PTR(-ENOBUFS);
 		goto out;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
 #endif
 
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
 #include <trace/events/neigh.h>
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 15f68842ac6b..198ce503ae73 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -145,6 +145,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
 	}
 }
 
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+			      struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	if (ops->pre_exit) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops->pre_exit(net);
+	}
+}
+
 static void ops_exit_list(const struct pernet_operations *ops,
 			  struct list_head *net_exit_list)
 {
@@ -330,6 +341,12 @@ out_undo:
 	list_add(&net->exit_list, &net_exit_list);
 	saved_ops = ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
+	synchronize_rcu();
+
+	ops = saved_ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
 
 	ops = saved_ops;
@@ -541,10 +558,15 @@ static void cleanup_net(struct work_struct *work)
 		list_add_tail(&net->exit_list, &net_exit_list);
 	}
 
+	/* Run all of the network namespace pre_exit methods */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
 	 * This needs to be before calling the exit() notifiers, so
 	 * the rcu_barrier() below isn't sufficient alone.
+	 * Also the pre_exit() and exit() methods need this barrier.
 	 */
 	synchronize_rcu();
 
@@ -1101,6 +1123,8 @@ static int __register_pernet_operations(struct list_head *list,
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 	return error;
@@ -1115,6 +1139,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	/* See comment in __register_pernet_operations() */
 	for_each_net(net)
 		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 }
@@ -1139,6 +1165,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	} else {
 		LIST_HEAD(net_exit_list);
 		list_add(&init_net.exit_list, &net_exit_list);
+		ops_pre_exit_list(ops, &net_exit_list);
+		synchronize_rcu();
 		ops_exit_list(ops, &net_exit_list);
 		ops_free_list(ops, &net_exit_list);
 	}
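The net_namespace.c change above introduces a two-phase namespace teardown: every registered pre_exit() runs first, then the core issues synchronize_rcu(), and only then do the exit() methods run. That lets a subsystem unplug its packet-processing hooks in pre_exit() and free the state those hooks used in exit(), without racing softirqs. A sketch of a user of the new callback, with hypothetical foo_* names:

	static unsigned int foo_net_id __read_mostly;
	struct foo_net { void *priv; };
	static struct nf_hook_ops foo_hooks[1];

	static void __net_exit foo_pre_exit_net(struct net *net)
	{
		/* phase 1: unplug hooks; in-flight RCU readers drain during
		 * the synchronize_rcu() the core runs after all pre_exit()
		 * callbacks, before any exit() */
		nf_unregister_net_hooks(net, foo_hooks, ARRAY_SIZE(foo_hooks));
	}

	static void __net_exit foo_exit_net(struct net *net)
	{
		/* phase 2: now safe to free state the hooks were using */
		kfree(((struct foo_net *)net_generic(net, foo_net_id))->priv);
	}

	static struct pernet_operations foo_net_ops = {
		.pre_exit = foo_pre_exit_net,
		.exit	  = foo_exit_net,
		.id	  = &foo_net_id,
		.size	  = sizeof(struct foo_net),
	};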
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
 
 	if (!np->local_ip.ip) {
 		if (!np->ipv6) {
+			const struct in_ifaddr *ifa;
+
 			in_dev = __in_dev_get_rtnl(ndev);
+			if (!in_dev)
+				goto put_noaddr;
 
-			if (!in_dev || !in_dev->ifa_list) {
+			ifa = rtnl_dereference(in_dev->ifa_list);
+			if (!ifa) {
+put_noaddr:
 				np_err(np, "no IP address for %s, aborting\n",
 				       np->dev_name);
 				err = -EDESTADDRREQ;
 				goto put;
 			}
 
-			np->local_ip.ip = in_dev->ifa_list->ifa_local;
+			np->local_ip.ip = ifa->ifa_local;
 			np_info(np, "local IP %pI4\n", &np->local_ip.ip);
 		} else {
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..b366f59885c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,9 +4,11 @@
  * Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
  *	Copyright (C) 2016 Red Hat, Inc.
  */
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/device.h>
 
 #include <net/page_pool.h>
 #include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
 
+#include <trace/events/page_pool.h>
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -43,6 +47,11 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
+	atomic_set(&pool->pages_state_release_cnt, 0);
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP)
+		get_device(pool->p.dev);
+
 	return 0;
 }
 
@@ -151,6 +160,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	page->dma_addr = dma;
 
 skip_dma_map:
+	/* Track how many pages are held 'in-flight' */
+	pool->pages_state_hold_cnt++;
+
+	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
 }
@@ -173,6 +187,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)	(s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
+
+	distance = _distance(hold_cnt, release_cnt);
+
+	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+	return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+	s32 inflight = page_pool_inflight(pool);
+
+	/* The distance should not be able to become negative */
+	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+	return (inflight == 0);
+}
+
 /* Cleanup page_pool state from page */
 static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
@@ -180,7 +221,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
 	dma_addr_t dma;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-		return;
+		goto skip_dma_unmap;
 
 	dma = page->dma_addr;
 	/* DMA unmap */
@@ -188,12 +229,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
+skip_dma_unmap:
+	atomic_inc(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page,
+				      atomic_read(&pool->pages_state_release_cnt));
 }
 
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+	/* When page is unmapped, this implies page will not be
+	 * returned to page_pool.
+	 */
+	__page_pool_clean_page(pool, page);
+}
+EXPORT_SYMBOL(page_pool_unmap_page);
+
 /* Return a page to the page allocator, cleaning up our state */
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
 	__page_pool_clean_page(pool, page);
+
 	put_page(page);
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +341,41 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
 {
-	struct page_pool *pool;
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
 
-	pool = container_of(rcu, struct page_pool, rcu);
+	distance = _distance(hold_cnt, release_cnt);
+
+	/* Drivers should fix this, but only problematic when DMA is used */
+	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+	     distance, hold_cnt, release_cnt);
+}
+
+void __page_pool_free(struct page_pool *pool)
+{
 	WARN(pool->alloc.count, "API usage violation");
+	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+	/* Can happen due to forced shutdown */
+	if (!__page_pool_safe_to_destroy(pool))
+		__warn_in_flight(pool);
 
-	__page_pool_empty_ring(pool);
 	ptr_ring_cleanup(&pool->ring, NULL);
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP)
+		put_device(pool->p.dev);
+
 	kfree(pool);
 }
+EXPORT_SYMBOL(__page_pool_free);
 
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
 {
 	struct page *page;
 
@@ -317,7 +393,6 @@ void page_pool_destroy(struct page_pool *pool)
 	 */
 	__page_pool_empty_ring(pool);
 
-	/* An xdp_mem_allocator can still ref page_pool pointer */
-	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+	return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; memset(&ivi, 0, sizeof(ivi)); @@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; @@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || @@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c8cd99c3603f..5323441a12cc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,6 +72,7 @@ #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "datagram.h" @@ -365,19 +366,21 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc; - unsigned long flags; - void *data; + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - local_irq_save(flags); - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, gfp_mask); - local_irq_restore(flags); - return data; + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } +void *napi_alloc_frag(unsigned int fragsz) +{ + fragsz = SKB_DATA_ALIGN(fragsz); + + return __napi_alloc_frag(fragsz, GFP_ATOMIC); +} +EXPORT_SYMBOL(napi_alloc_frag); + /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size @@ -387,26 +390,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - fragsz = SKB_DATA_ALIGN(fragsz); - - return __netdev_alloc_frag(fragsz, GFP_ATOMIC); -} -EXPORT_SYMBOL(netdev_alloc_frag); - -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - return page_frag_alloc(&nc->page, fragsz, gfp_mask); -} + struct page_frag_cache *nc; + void *data; -void *napi_alloc_frag(unsigned int fragsz) -{ fragsz = SKB_DATA_ALIGN(fragsz); - - return __napi_alloc_frag(fragsz, GFP_ATOMIC); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); + } else { + local_bh_disable(); + data = __napi_alloc_frag(fragsz, GFP_ATOMIC); + local_bh_enable(); + } + return data; } -EXPORT_SYMBOL(napi_alloc_frag); +EXPORT_SYMBOL(netdev_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff 
for rx on a specific device @@ -425,7 +423,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; - unsigned long flags; struct sk_buff *skb; bool pfmemalloc; void *data; @@ -446,13 +443,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - local_irq_save(flags); - - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; - - local_irq_restore(flags); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + } else { + local_bh_disable(); + nc = this_cpu_ptr(&napi_alloc_cache.page); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + local_bh_enable(); + } if (unlikely(!data)) return NULL; @@ -909,6 +910,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) } /** + * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg + * @first: first sk_buff of the msg + */ +struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) +{ + struct sk_buff *n; + + n = alloc_skb(0, GFP_ATOMIC); + if (!n) + return NULL; + + n->len = first->len; + n->data_len = first->len; + n->truesize = first->truesize; + + skb_shinfo(n)->frag_list = first; + + __copy_skb_header(n, first); + n->destructor = NULL; + + return n; +} +EXPORT_SYMBOL_GPL(alloc_skb_for_msg); + +/** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents @@ -2508,7 +2534,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = ops->update(skb->data + offset, copy, csum); + csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, + skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -2535,9 +2562,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, frag->page_offset + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = ops->update(vaddr + p_off, p_len, 0); + csum2 = INDIRECT_CALL_1(ops->update, + csum_partial_ext, + vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = ops->combine(csum, csum2, pos, p_len); + csum = INDIRECT_CALL_1(ops->combine, + csum_block_add_ext, csum, + csum2, pos, p_len); pos += p_len; } @@ -2560,7 +2591,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, copy = len; csum2 = __skb_checksum(frag_iter, offset - start, copy, 0, ops); - csum = ops->combine(csum, csum2, pos, copy); + csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, + csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; diff --git a/net/core/sock.c b/net/core/sock.c index aa4a00d381e3..0eb21384079d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1039,6 +1039,10 @@ set_rcvbuf: } break; + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index be6092ac69f8..52d4faeee18b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* Make sure page count doesn't overflow. 
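The INDIRECT_CALL_1() wrappers added to __skb_checksum() above are a retpoline-era optimization: when the function pointer equals the expected built-in, the call is emitted as a direct call instead of an indirect branch. Roughly (a sketch of the idea, not the exact kernel header, which also compiles away to a plain indirect call when retpolines are disabled):

#define INDIRECT_CALL_1(f, f1, ...)				\
	({							\
		likely(f == f1) ? f1(__VA_ARGS__) :		\
				  f(__VA_ARGS__);		\
	})

/* So the first hunk above effectively becomes:
 *
 *	if (ops->update == csum_partial_ext)
 *		csum = csum_partial_ext(skb->data + offset, copy, csum);
 *	else
 *		csum = ops->update(skb->data + offset, copy, csum);
 *
 * saving a retpoline round-trip for the common csum_partial case.
 */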
*/ cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); + err = bpf_map_charge_init(&stab->map.memory, cost); if (err) goto free_stab; @@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (stab->sks) return &stab->map; err = -ENOMEM; + bpf_map_charge_finish(&stab->map.memory); free_stab: kfree(stab); return ERR_PTR(err); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index dc4aefdf2a08..9408f9264d05 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) return 0; } EXPORT_SYMBOL(reuseport_attach_prog); + +int reuseport_detach_prog(struct sock *sk) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return sk->sk_reuseport ? -ENOENT : -EINVAL; + + old_prog = NULL; + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_swap_protected(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); + spin_unlock_bh(&reuseport_lock); + + if (!old_prog) + return -ENOENT; + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_detach_prog); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8aab08b131d9..b29d7b513a18 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,6 +14,8 @@ #include <net/page_pool.h> #include <net/xdp.h> +#include <net/xdp_priv.h> /* struct xdp_mem_allocator */ +#include <trace/events/xdp.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN; static bool mem_id_init; /* false */ static struct rhashtable *mem_id_ht; -struct xdp_mem_allocator { - struct xdp_mem_info mem; - union { - void *allocator; - struct page_pool *page_pool; - struct zero_copy_allocator *zc_alloc; - }; - struct rhash_head node; - struct rcu_head rcu; -}; - static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; @@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); + /* Allocator have indicated safe to remove before this is called */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + page_pool_free(xa->page_pool); + /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Notice, driver is expected to free the *allocator, - * e.g. page_pool, and MUST also use RCU free. 
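reuseport_detach_prog() above is the kernel half of the SO_DETACH_REUSEPORT_BPF socket option added in the sock.c hunk earlier in this diff. A hedged userspace sketch (the constant is 68 in the 5.3-era uapi headers; the fallback define covers older libc headers):

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_DETACH_REUSEPORT_BPF
#define SO_DETACH_REUSEPORT_BPF 68	/* uapi value, if libc lacks it */
#endif

/* Detach a previously attached reuseport BPF program from @fd.
 * Per the code above: -ENOENT if no program is attached,
 * -EINVAL if the socket never joined a reuseport group.
 * The option takes no meaningful argument, but sock_setsockopt()
 * still insists on an int-sized optval.
 */
static int detach_reuseport_bpf(int fd)
{
	int none = 0;

	if (setsockopt(fd, SOL_SOCKET, SO_DETACH_REUSEPORT_BPF,
		       &none, sizeof(none))) {
		perror("SO_DETACH_REUSEPORT_BPF");
		return -1;
	}
	return 0;
}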
- */ - /* Poison memory */ xa->mem.id = 0xFFFF; xa->mem.type = 0xF0F0; @@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } +bool __mem_id_disconnect(int id, bool force) +{ + struct xdp_mem_allocator *xa; + bool safe_to_remove = true; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); + return true; + } + xa->disconnect_cnt++; + + /* Detects in-flight packet-pages for page_pool */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + safe_to_remove = page_pool_request_shutdown(xa->page_pool); + + trace_mem_disconnect(xa, safe_to_remove, force); + + if ((safe_to_remove || force) && + !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); + return (safe_to_remove|force); +} + +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (30 * HZ) +#define DEFER_MAX_RETRIES 120 + +static void mem_id_disconnect_defer_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); + bool force = false; + + if (xa->disconnect_cnt > DEFER_MAX_RETRIES) + force = true; + + if (__mem_id_disconnect(xa->mem.id, force)) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, xa->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + + pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", + __func__, xa->mem.id, xa->disconnect_cnt, sec); + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); +} + void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; + if (__mem_id_disconnect(id, false)) + return; + + /* Could not disconnect, defer new disconnect attempt to later */ mutex_lock(&mem_id_lock); xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + xa->defer_start = jiffies; + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); mutex_unlock(&mem_id_lock); + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); +/* This unregister operation will also cleanup and destroy the + * allocator. The page_pool_free() operation is first called when it's + * safe to remove, possibly deferred to a workqueue. 
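With this change the allocator's lifetime is owned by the xdp layer: a driver that registered its page_pool as MEM_TYPE_PAGE_POOL must no longer free the pool itself, because xdp_rxq_info_unreg_mem_model() now requests shutdown and, once in-flight pages drain (possibly via the deferred work above), page_pool_free() runs from the RCU callback. A hedged teardown sketch for a hypothetical driver queue:

#include <net/page_pool.h>
#include <net/xdp.h>

/* Hypothetical per-RX-queue state, for illustration only. */
struct myq {
	struct xdp_rxq_info xdp_rxq;
	struct page_pool *pool;
};

static void myq_destroy(struct myq *q)
{
	/* Unregisters the mem model: returns cached pages, checks
	 * for in-flight pages, and frees the page_pool once safe
	 * (deferring to a workqueue when pages are still out).
	 */
	xdp_rxq_info_unreg(&q->xdp_rxq);

	/* No explicit page_pool free here anymore; after this
	 * series that would be a double free.
	 */
	q->pool = NULL;
}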
+ */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -301,12 +364,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { + ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); + xdp_rxq->mem.id = 0; errno = PTR_ERR(ptr); goto err; } mutex_unlock(&mem_id_lock); + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; err: mutex_unlock(&mem_id_lock); @@ -333,10 +399,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) { + if (likely(xa)) { napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); } else { + /* Hopefully stack show who to blame for late return */ + WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); + trace_mem_return_failed(mem, page); put_page(page); } rcu_read_unlock(); @@ -379,6 +448,21 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + int xdp_attachment_query(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index d449f78c1bd0..6e942dda1bcd 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -106,6 +106,7 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" select NET_DSA_TAG_8021Q + select PACKING help Say Y or M if you want to enable support for tagging frames with the NXP SJA1105 switch family. 
Both the native tagging protocol (which diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 820dd8da57fc..3abd173ebacb 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -257,7 +257,7 @@ static int dsa_port_setup(struct dsa_port *dp) enum devlink_port_flavour flavour; struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - int err; + int err = 0; if (dp->type == DSA_PORT_TYPE_UNUSED) return 0; @@ -295,19 +295,15 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_DSA: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_USER: err = dsa_slave_create(dp); @@ -319,7 +315,10 @@ static int dsa_port_setup(struct dsa_port *dp) break; } - return 0; + if (err) + devlink_port_unregister(&dp->devlink_port); + + return err; } static void dsa_port_teardown(struct dsa_port *dp) @@ -347,7 +346,7 @@ static void dsa_port_teardown(struct dsa_port *dp) static int dsa_switch_setup(struct dsa_switch *ds) { - int err; + int err = 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -365,29 +364,41 @@ static int dsa_switch_setup(struct dsa_switch *ds) err = devlink_register(ds->devlink, ds->dev); if (err) - return err; + goto free_devlink; err = dsa_switch_register_notifier(ds); if (err) - return err; + goto unregister_devlink; err = ds->ops->setup(ds); if (err < 0) - return err; + goto unregister_notifier; if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); - if (!ds->slave_mii_bus) - return -ENOMEM; + if (!ds->slave_mii_bus) { + err = -ENOMEM; + goto unregister_notifier; + } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - return err; + goto unregister_notifier; } return 0; + +unregister_notifier: + dsa_switch_unregister_notifier(ds); +unregister_devlink: + devlink_unregister(ds->devlink); +free_devlink: + devlink_free(ds->devlink); + ds->devlink = NULL; + + return err; } static void dsa_switch_teardown(struct dsa_switch *ds) @@ -397,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); + if (ds->ops->teardown) + ds->ops->teardown(ds); + if (ds->devlink) { devlink_unregister(ds->devlink); devlink_free(ds->devlink); @@ -409,8 +423,8 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - int err; + int device, port, i; + int err = 0; for (device = 0; device < DSA_MAX_SWITCHES; device++) { ds = dst->ds[device]; @@ -419,18 +433,41 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) err = dsa_switch_setup(ds); if (err) - return err; + goto switch_teardown; for (port = 0; port < ds->num_ports; port++) { dp = &ds->ports[port]; err = dsa_port_setup(dp); if (err) - return err; + goto ports_teardown; } } return 0; + +ports_teardown: + for (i = 0; i < port; i++) + dsa_port_teardown(&ds->ports[i]); + + dsa_switch_teardown(ds); + +switch_teardown: + for (i = 0; i < device; i++) { + ds = dst->ds[i]; + if (!ds) + continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + dsa_port_teardown(dp); + } + + 
dsa_switch_teardown(ds); + } + + return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) @@ -492,17 +529,24 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_switches(dst); if (err) - return err; + goto teardown_default_cpu; err = dsa_tree_setup_master(dst); if (err) - return err; + goto teardown_switches; dst->setup = true; pr_info("DSA: tree %d setup\n", dst->index); return 0; + +teardown_switches: + dsa_tree_teardown_switches(dst); +teardown_default_cpu: + dsa_tree_teardown_default_cpu(dst); + + return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) @@ -543,8 +587,10 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, dst->ds[index] = ds; err = dsa_tree_setup(dst); - if (err) - dsa_tree_remove_switch(dst, index); + if (err) { + dst->ds[index] = NULL; + dsa_tree_put(dst); + } return err; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3986cedfafc0..b2be53a13aa0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -159,6 +159,23 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state); +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state); +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state); +void dsa_port_phylink_mac_an_restart(struct phylink_config *config); +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface); +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev); +extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 363eab6df51b..d2b65e8dc60c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -336,9 +336,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - /* Can be called from dsa_slave_port_obj_add() or - * dsa_slave_vlan_rx_add_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -354,12 +351,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - if (vlan->obj.orig_dev && netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - - /* Can be called from dsa_slave_port_obj_del() or - * dsa_slave_vlan_rx_kill_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); @@ -418,6 +409,108 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_validate) + return; + + ds->ops->phylink_validate(ds, dp->index, supported, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); + +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, 
struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + /* Only called for SGMII and 802.3z */ + if (!ds->ops->phylink_mac_link_state) + return -EOPNOTSUPP; + + return ds->ops->phylink_mac_link_state(ds, dp->index, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); + +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_config) + return; + + ds->ops->phylink_mac_config(ds, dp->index, mode, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); + +void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_an_restart) + return; + + ds->ops->phylink_mac_an_restart(ds, dp->index); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); + +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct phy_device *phydev = NULL; + struct dsa_switch *ds = dp->ds; + + if (dsa_is_user_port(ds, dp->index)) + phydev = dp->slave->phydev; + + if (!ds->ops->phylink_mac_link_down) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); + +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_link_up) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); + +const struct phylink_mac_ops dsa_port_phylink_mac_ops = { + .validate = dsa_port_phylink_validate, + .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_config = dsa_port_phylink_mac_config, + .mac_an_restart = dsa_port_phylink_mac_an_restart, + .mac_link_down = dsa_port_phylink_mac_link_down, + .mac_link_up = dsa_port_phylink_mac_link_up, +}; + static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) { struct dsa_switch *ds = dp->ds; @@ -495,8 +588,53 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) return 0; } +static int dsa_port_phylink_register(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + struct device_node *port_dn = dp->dn; + int mode, err; + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + + dp->pl_config.dev = ds->dev; + dp->pl_config.type = PHYLINK_DEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), + mode, &dsa_port_phylink_mac_ops); + if (IS_ERR(dp->pl)) { + pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); + return PTR_ERR(dp->pl); + } + + err = phylink_of_phy_connect(dp->pl, port_dn, 0); + if (err && err != -ENODEV) { + pr_err("could not attach to PHY: %d\n", err); + goto err_phy_connect; + } + + rtnl_lock(); + phylink_start(dp->pl); + rtnl_unlock(); + + return 0; + +err_phy_connect: + phylink_destroy(dp->pl); + return err; +} + int 
dsa_port_link_register_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) + return dsa_port_phylink_register(dp); + + dev_warn(ds->dev, + "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); + if (of_phy_is_fixed_link(dp->dn)) return dsa_port_fixed_link_register_of(dp); else @@ -505,6 +643,16 @@ int dsa_port_link_register_of(struct dsa_port *dp) void dsa_port_link_unregister_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) { + rtnl_lock(); + phylink_disconnect_phy(dp->pl); + rtnl_unlock(); + phylink_destroy(dp->pl); + return; + } + if (of_phy_is_fixed_link(dp->dn)) of_phy_deregister_fixed_link(dp->dn); else diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8157be7e162d..99673f6b07f6 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,7 +22,7 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(struct net_device *dev); +static bool dsa_slave_dev_check(const struct net_device *dev); /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) @@ -311,7 +311,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans) + struct switchdev_trans *trans, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; @@ -323,6 +324,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -333,6 +336,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), trans); break; @@ -352,6 +357,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -361,6 +368,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: @@ -423,6 +432,8 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; + DSA_SKB_CB(skb)->clone = clone; + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) return; @@ -460,6 +471,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) u64_stats_update_end(&s->syncp); DSA_SKB_CB(skb)->deferred_xmit = false; + DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the * switch driver @@ -1160,98 +1172,6 @@ static struct device_type dsa_type = { .name = "dsa", }; -static void dsa_slave_phylink_validate(struct net_device *dev, - unsigned long *supported, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_validate) - return; - - ds->ops->phylink_validate(ds, dp->index, supported, state); -} - -static int dsa_slave_phylink_mac_link_state(struct net_device 
*dev, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - /* Only called for SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; - - return ds->ops->phylink_mac_link_state(ds, dp->index, state); -} - -static void dsa_slave_phylink_mac_config(struct net_device *dev, - unsigned int mode, - const struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_config) - return; - - ds->ops->phylink_mac_config(ds, dp->index, mode, state); -} - -static void dsa_slave_phylink_mac_an_restart(struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - -static void dsa_slave_phylink_mac_link_down(struct net_device *dev, - unsigned int mode, - phy_interface_t interface) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); -} - -static void dsa_slave_phylink_mac_link_up(struct net_device *dev, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); -} - -static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = { - .validate = dsa_slave_phylink_validate, - .mac_link_state = dsa_slave_phylink_mac_link_state, - .mac_config = dsa_slave_phylink_mac_config, - .mac_an_restart = dsa_slave_phylink_mac_an_restart, - .mac_link_down = dsa_slave_phylink_mac_link_down, - .mac_link_up = dsa_slave_phylink_mac_link_up, -}; - void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); @@ -1299,8 +1219,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) if (mode < 0) mode = PHY_INTERFACE_MODE_NA; - dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode, - &dsa_slave_phylink_mac_ops); + dp->pl_config.dev = &slave_dev->dev; + dp->pl_config.type = PHYLINK_NETDEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, + &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { netdev_err(slave_dev, "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); @@ -1494,7 +1417,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(struct net_device *dev) +static bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } @@ -1565,19 +1488,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, return NOTIFY_DONE; } -static int -dsa_slave_switchdev_port_attr_set_event(struct net_device *netdev, - struct switchdev_notifier_port_attr_info *port_attr_info) -{ - int err; - - err = dsa_slave_port_attr_set(netdev, port_attr_info->attr, - port_attr_info->trans); - - port_attr_info->handled = true; - return notifier_from_errno(err); -} - struct dsa_switchdev_event_work { struct work_struct work; 
struct switchdev_notifier_fdb_info fdb_info; @@ -1652,13 +1562,18 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct dsa_switchdev_event_work *switchdev_work; + int err; + + if (event == SWITCHDEV_PORT_ATTR_SET) { + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); + } if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; - if (event == SWITCHDEV_PORT_ATTR_SET) - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return NOTIFY_BAD; @@ -1688,41 +1603,28 @@ err_fdb_work_init: return NOTIFY_BAD; } -static int -dsa_slave_switchdev_port_obj_event(unsigned long event, - struct net_device *netdev, - struct switchdev_notifier_port_obj_info *port_obj_info) -{ - int err = -EOPNOTSUPP; - - switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: - err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, - port_obj_info->trans); - break; - case SWITCHDEV_PORT_OBJ_DEL: - err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); - break; - } - - port_obj_info->handled = true; - return notifier_from_errno(err); -} - static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - - if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + int err; switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_ADD: + err = switchdev_handle_port_obj_add(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_add); + return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: - return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + err = switchdev_handle_port_obj_del(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_del); + return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); } return NOTIFY_DONE; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 65a35e976d7b..6ebbd799c4eb 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, - struct packet_type *pt, u16 *tpid, u16 *tci) +/* In the DSA packet_type handler, skb->data points in the middle of the VLAN + * tag, after tpid and before tci. This is because so far, ETH_HLEN + * (DMAC, SMAC, EtherType) bytes were pulled. + * There are 2 bytes of VLAN tag left in skb->data, and upper + * layers expect the 'real' EtherType to be consumed as well. + * Coincidentally, a VLAN header is also of the same size as + * the number of bytes that need to be pulled. 
+ * + * skb_mac_header skb->data + * | | + * v v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+-------+-------+-------+ + * | Destination MAC | Source MAC | TPID | TCI | EType | + * +-----------------------+-----------------------+-------+-------+-------+ + * ^ | | + * |<--VLAN_HLEN-->to <---VLAN_HLEN---> + * from | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ ^ + * (now part of | | + * skb->head) skb_mac_header skb->data + */ +struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) { - struct vlan_ethhdr *tag; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) - return NULL; + u8 *from = skb_mac_header(skb); + u8 *dest = from + VLAN_HLEN; - tag = vlan_eth_hdr(skb); - *tpid = ntohs(tag->h_vlan_proto); - *tci = ntohs(tag->h_vlan_TCI); - - /* skb->data points in the middle of the VLAN tag, - * after tpid and before tci. This is because so far, - * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - */ - skb_pull_rcsum(skb, VLAN_HLEN); + memmove(dest, from, ETH_HLEN - VLAN_HLEN); + skb_pull(skb, VLAN_HLEN); + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + skb_pull_rcsum(skb, ETH_HLEN); return skb; } -EXPORT_SYMBOL_GPL(dsa_8021q_rcv); +EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); static const struct dsa_device_ops dsa_8021q_netdev_ops = { .name = "8021q", diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d43737e6c3fb..1d96c9d4a8e9 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -13,6 +13,8 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) const struct ethhdr *hdr = eth_hdr(skb); u64 dmac = ether_addr_to_u64(hdr->h_dest); + if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META) + return false; if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) == SJA1105_LINKLOCAL_FILTER_A) return true; @@ -22,15 +24,61 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) return false; } +struct sja1105_meta { + u64 tstamp; + u64 dmac_byte_4; + u64 dmac_byte_3; + u64 source_port; + u64 switch_id; +}; + +static void sja1105_meta_unpack(const struct sk_buff *skb, + struct sja1105_meta *meta) +{ + u8 *buf = skb_mac_header(skb) + ETH_HLEN; + + /* UM10944.pdf section 4.2.17 AVB Parameters: + * Structure of the meta-data follow-up frame. + * It is in network byte order, so there are no quirks + * while unpacking the meta frame. + * + * Also SJA1105 E/T only populates bits 23:0 of the timestamp + * whereas P/Q/R/S does 32 bits. Since the structure is the + * same and the E/T puts zeroes in the high-order byte, use + * a unified unpacking command for both device series. 
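The packing() calls that follow come from the NXP packing library (hence the new select PACKING in the Kconfig hunk above). Because the meta frame is big-endian and byte-aligned, the same unpack can be written in plain C; a sketch over the eight payload bytes, assuming the UM10944 layout described above:

#include <stdint.h>

struct sja1105_meta_sketch {
	uint32_t tstamp;	/* buf[0..3], network byte order */
	uint8_t  dmac_byte_4;	/* buf[4] */
	uint8_t  dmac_byte_3;	/* buf[5] */
	uint8_t  source_port;	/* buf[6] */
	uint8_t  switch_id;	/* buf[7] */
};

/* Equivalent of the unified unpack: on E/T only the low 24 bits
 * of tstamp are populated (the top byte reads as zero), on
 * P/Q/R/S all 32 bits are valid.
 */
static void meta_unpack_sketch(const uint8_t buf[8],
			       struct sja1105_meta_sketch *m)
{
	m->tstamp = ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
		    ((uint32_t)buf[2] << 8)  |  (uint32_t)buf[3];
	m->dmac_byte_4 = buf[4];
	m->dmac_byte_3 = buf[5];
	m->source_port = buf[6];
	m->switch_id   = buf[7];
}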
+ */ + packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0); + packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0); + packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0); + packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0); + packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); +} + +static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) +{ + const struct ethhdr *hdr = eth_hdr(skb); + u64 smac = ether_addr_to_u64(hdr->h_source); + u64 dmac = ether_addr_to_u64(hdr->h_dest); + + if (smac != SJA1105_META_SMAC) + return false; + if (dmac != SJA1105_META_DMAC) + return false; + if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META) + return false; + return true; +} + /* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it, and if we can, annotate skb->cb with how we - * plan to do that, so we don't need to check again in the rcv function. + * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { + if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + return true; if (sja1105_is_link_local(skb)) return true; - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_is_meta_frame(skb)) return true; return false; } @@ -62,25 +110,152 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static void sja1105_transfer_meta(struct sk_buff *skb, + const struct sja1105_meta *meta) +{ + struct ethhdr *hdr = eth_hdr(skb); + + hdr->h_dest[3] = meta->dmac_byte_3; + hdr->h_dest[4] = meta->dmac_byte_4; + SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; +} + +/* This is a simple state machine which follows the hardware mechanism of + * generating RX timestamps: + * + * After each timestampable skb (all traffic for which send_meta1 and + * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame + * containing a partial timestamp is immediately generated by the switch and + * sent as a follow-up to the link-local frame on the CPU port. + * + * The meta frames have no unique identifier (such as sequence number) by which + * one may pair them to the correct timestampable frame. + * Instead, the switch has internal logic that ensures no frames are sent on + * the CPU port between a link-local timestampable frame and its corresponding + * meta follow-up. It also ensures strict ordering between ports (lower ports + * have higher priority towards the CPU port). For this reason, a per-port + * data structure is not needed/desirable. + * + * This function pairs the link-local frame with its partial timestamp from the + * meta follow-up frame. The full timestamp will be reconstructed later in a + * work queue. + */ +static struct sk_buff +*sja1105_rcv_meta_state_machine(struct sk_buff *skb, + struct sja1105_meta *meta, + bool is_link_local, + bool is_meta) +{ + struct sja1105_port *sp; + struct dsa_port *dp; + + dp = dsa_slave_to_port(skb->dev); + sp = dp->priv; + + /* Step 1: A timestampable frame was received. + * Buffer it until we get its meta frame. + */ + if (is_link_local && sp->data->hwts_rx_en) { + spin_lock(&sp->data->meta_lock); + /* Was this a link-local frame instead of the meta + * that we were expecting? + */ + if (sp->data->stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Expected meta frame, is %12llx " + "in the DSA master multicast filter?\n", + SJA1105_META_DMAC); + } + + /* Hold a reference to avoid dsa_switch_rcv + * from freeing the skb. 
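/* Reference flow of the buffered frame (a sketch of what steps 1
 * and 2 do with skb->users):
 *
 *	step 1:	stampable_skb = skb_get(skb);	users 1 -> 2
 *		return NULL;	dsa_switch_rcv() then kfree_skb()s
 *				its own reference, users 2 -> 1,
 *				and the frame survives in the cache
 *
 *	step 2:	skb_copy(stampable_skb, ...);	private copy made
 *		skb_unref(stampable_skb);	users 1 -> 0
 *
 * so the reference taken in step 1 is dropped only after the
 * partial timestamp has been transferred to the copy handed back
 * to the stack.
 */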
+ */ + sp->data->stampable_skb = skb_get(skb); + spin_unlock(&sp->data->meta_lock); + + /* Tell DSA we got nothing */ + return NULL; + + /* Step 2: The meta frame arrived. + * Time to take the stampable skb out of the closet, annotate it + * with the partial timestamp, and pretend that we received it + * just now (basically masquerade the buffered frame as the meta + * frame, which serves no further purpose). + */ + } else if (is_meta) { + struct sk_buff *stampable_skb; + + spin_lock(&sp->data->meta_lock); + + stampable_skb = sp->data->stampable_skb; + sp->data->stampable_skb = NULL; + + /* Was this a meta frame instead of the link-local + * that we were expecting? + */ + if (!stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Unexpected meta frame\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + if (stampable_skb->dev != skb->dev) { + dev_err_ratelimited(dp->ds->dev, + "Meta frame on wrong port\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + /* Free the meta frame and give DSA the buffered stampable_skb + * for further processing up the network stack. + */ + kfree_skb(skb); + + skb = skb_copy(stampable_skb, GFP_ATOMIC); + if (!skb) { + dev_err_ratelimited(dp->ds->dev, + "Failed to copy stampable skb\n"); + return NULL; + } + sja1105_transfer_meta(skb, meta); + /* The cached copy will be freed now */ + skb_unref(stampable_skb); + + spin_unlock(&sp->data->meta_lock); + } + + return skb; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - struct ethhdr *hdr = eth_hdr(skb); - u64 source_port, switch_id; - struct sk_buff *nskb; + struct sja1105_meta meta = {0}; + int source_port, switch_id; + struct vlan_ethhdr *hdr; u16 tpid, vid, tci; + bool is_link_local; bool is_tagged; + bool is_meta; - nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci); - is_tagged = (nskb && tpid == ETH_P_SJA1105); - - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - vid = tci & VLAN_VID_MASK; + hdr = vlan_eth_hdr(skb); + tpid = ntohs(hdr->h_vlan_proto); + is_tagged = (tpid == ETH_P_SJA1105); + is_link_local = sja1105_is_link_local(skb); + is_meta = sja1105_is_meta_frame(skb); skb->offload_fwd_mark = 1; - if (sja1105_is_link_local(skb)) { + if (is_tagged) { + /* Normal traffic path. */ + tci = ntohs(hdr->h_vlan_TCI); + vid = tci & VLAN_VID_MASK; + source_port = dsa_8021q_rx_source_port(vid); + switch_id = dsa_8021q_rx_switch_id(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of * the incl_srcpt options. @@ -90,10 +265,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, /* Clear the DMAC bytes that were mangled by the switch */ hdr->h_dest[3] = 0; hdr->h_dest[4] = 0; + } else if (is_meta) { + sja1105_meta_unpack(skb, &meta); + source_port = meta.source_port; + switch_id = meta.switch_id; } else { - /* Normal traffic path. */ - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); + return NULL; } skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); @@ -106,10 +283,10 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). 
*/ if (is_tagged) - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN, - ETH_HLEN - VLAN_HLEN); + skb = dsa_8021q_remove_header(skb); - return skb; + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, + is_meta); } static struct dsa_device_ops sja1105_netdev_ops = { diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 20b0bcb7e9e3..17374afee28f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -545,17 +545,10 @@ unsigned char * __weak arch_get_platform_mac_address(void) int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { - const unsigned char *addr; - struct device_node *dp; + const unsigned char *addr = NULL; - if (dev_is_pci(dev)) - dp = pci_device_to_OF_node(to_pci_dev(dev)); - else - dp = dev->of_node; - - addr = NULL; - if (dp) - addr = of_get_mac_address(dp); + if (dev->of_node) + addr = of_get_mac_address(dev->of_node); if (IS_ERR_OR_NULL(addr)) addr = arch_get_platform_mac_address(); @@ -563,6 +556,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return -ENODEV; ether_addr_copy(mac_addr, addr); + return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 9133e3cede77..e4aba5d485be 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -74,7 +74,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, key.src = *src; key.dst = *dst; - q = inet_frag_find(&ieee802154_lowpan->frags, &key); + q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; @@ -134,7 +134,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { @@ -321,23 +321,18 @@ err: static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", - .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", - .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", - .data = &init_net.ieee802154_lowpan.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -372,17 +367,17 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) if (table == NULL) goto err_alloc; - table[0].data = &ieee802154_lowpan->frags.high_thresh; - table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; - table[1].data = &ieee802154_lowpan->frags.low_thresh; - table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; - table[2].data = &ieee802154_lowpan->frags.timeout; - /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table[0].procname = NULL; } + table[0].data = &ieee802154_lowpan->fqdir->high_thresh; + table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; + table[1].data = &ieee802154_lowpan->fqdir->low_thresh; + table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; + table[2].data = &ieee802154_lowpan->fqdir->timeout; + hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); if 
(hdr == NULL) goto err_reg; @@ -449,32 +444,42 @@ static int __net_init lowpan_frags_init_net(struct net *net) net_ieee802154_lowpan(net); int res; - ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; - ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - ieee802154_lowpan->frags.f = &lowpan_frags; - res = inet_frags_init_net(&ieee802154_lowpan->frags); + res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; + + ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = lowpan_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); return res; } +static void __net_exit lowpan_frags_pre_exit_net(struct net *net) +{ + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + fqdir_pre_exit(ieee802154_lowpan->fqdir); +} + static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { - .init = lowpan_frags_init_net, - .exit = lowpan_frags_exit_net, + .init = lowpan_frags_init_net, + .pre_exit = lowpan_frags_pre_exit_net, + .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) @@ -539,7 +544,7 @@ err_sysctl: void lowpan_net_frag_exit(void) { - inet_frags_fini(&lowpan_frags); lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); + inet_frags_fini(&lowpan_frags); } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 000a61994c8f..d57ecfaf89d4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o + metrics.o netlink.o nexthop.o obj-$(CONFIG_BPFILTER) += bpfilter/ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c6bd0f7a020a..137d1892395d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -190,7 +190,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); @@ -296,8 +297,8 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { - struct in_ifaddr *ifa; struct net_device *dev; + struct in_ifaddr *ifa; ASSERT_RTNL(); @@ -307,7 +308,7 @@ static void inetdev_destroy(struct in_device *in_dev) ip_mc_destroy_dev(in_dev); - while ((ifa = in_dev->ifa_list) != NULL) { + while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } @@ -323,30 +324,35 @@ static void inetdev_destroy(struct in_device *in_dev) int inet_addr_onlink(struct in_device *in_dev, __be32 a, 
__be32 b) { + const struct in_ifaddr *ifa; + rcu_read_lock(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } - } endfor_ifa(in_dev); + } rcu_read_unlock(); return 0; } -static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 portid) +static void __inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa, *ifa1 = *ifap; - struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *ifa, *ifa1; + struct in_ifaddr *last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); + ifa1 = rtnl_dereference(*ifap); + last_prim = rtnl_dereference(in_dev->ifa_list); if (in_dev->dead) goto no_promotions; @@ -355,9 +361,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr **ifap1 = &ifa1->ifa_next; + struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; - while ((ifa = *ifap1) != NULL) { + while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -390,7 +396,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ - for (ifa = promote; ifa; ifa = ifa->ifa_next) { + for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); @@ -416,19 +422,25 @@ no_promotions: blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { - struct in_ifaddr *next_sec = promote->ifa_next; + struct in_ifaddr *next_sec; + next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { - prev_prom->ifa_next = promote->ifa_next; - promote->ifa_next = last_prim->ifa_next; - last_prim->ifa_next = promote; + struct in_ifaddr *last_sec; + + rcu_assign_pointer(prev_prom->ifa_next, next_sec); + + last_sec = rtnl_dereference(last_prim->ifa_next); + rcu_assign_pointer(promote->ifa_next, last_sec); + rcu_assign_pointer(last_prim->ifa_next, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; + ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -440,7 +452,8 @@ no_promotions: inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -453,9 +466,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { + struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap, **last_primary; struct in_validator_info ivi; + struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); @@ -468,8 +482,10 @@ static 
int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + ifap = &in_dev->ifa_list; + ifa1 = rtnl_dereference(*ifap); + + while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; @@ -485,6 +501,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + + ifap = &ifa1->ifa_next; + ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh @@ -510,8 +529,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifap = last_primary; } - ifa->ifa_next = *ifap; - *ifap = ifa; + rcu_assign_pointer(ifa->ifa_next, *ifap); + rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); @@ -576,12 +595,14 @@ EXPORT_SYMBOL(inetdev_by_index); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { + struct in_ifaddr *ifa; + ASSERT_RTNL(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; - } endfor_ifa(in_dev); + } return NULL; } @@ -609,10 +630,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; - struct in_ifaddr *ifa, **ifap; + struct in_ifaddr *ifa; + int err = -EINVAL; ASSERT_RTNL(); @@ -629,7 +652,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -717,15 +740,19 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap; + struct in_ifaddr __rcu **ifap; + struct in_ifaddr *tmp; - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &(*ifap)->ifa_next) { - if (*ifap == ifa) { + ifap = &ifa->ifa_dev->ifa_list; + tmp = rtnl_dereference(*ifap); + while (tmp) { + if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } + ifap = &tmp->ifa_next; + tmp = rtnl_dereference(*ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -869,13 +896,12 @@ errout: static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap; + struct in_ifaddr *ifa1; if (!ifa->ifa_local) return NULL; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -970,8 +996,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; + struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; - struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; @@ -1042,7 +1068,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, 
struct ifreq *ifr) /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1055,7 +1083,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; @@ -1204,7 +1233,7 @@ out: static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; @@ -1214,7 +1243,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s if (!in_dev) goto out; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (!buf) { done += size; continue; @@ -1242,18 +1271,24 @@ out: static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; - } endfor_ifa(in_dev); + } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { + const struct in_ifaddr *ifa; __be32 addr = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net = dev_net(dev); int master_idx; @@ -1263,8 +1298,13 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!in_dev) goto no_in_dev; - for_primary_ifa(in_dev) { - if (ifa->ifa_scope > scope) + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; @@ -1272,7 +1312,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) } if (!addr) addr = ifa->ifa_local; - } endfor_ifa(in_dev); + } if (addr) goto out_unlock; @@ -1317,13 +1357,20 @@ EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { - int same = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; + const struct in_ifaddr *ifa; __be32 addr = 0; + int same = 0; + + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); - for_ifa(in_dev) { if (!addr && (local == ifa->ifa_local || !local) && - ifa->ifa_scope <= scope) { + min_scope <= scope) { addr = ifa->ifa_local; if (same) break; @@ -1338,7 +1385,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? 
*/ - if (ifa->ifa_scope <= scope) { + if (min_scope <= scope) { addr = ifa->ifa_local; break; } @@ -1346,7 +1393,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, same = 0; } } - } endfor_ifa(in_dev); + } return same ? addr : 0; } @@ -1420,7 +1467,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) struct in_ifaddr *ifa; int named = 0; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); @@ -1450,10 +1497,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; - for (ifa = in_dev->ifa_list; ifa; - ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, @@ -1723,15 +1769,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, int ip_idx = 0; int err; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (ip_idx < s_ip_idx) { + ip_idx++; continue; - + } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; } err = 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e54c2bcbb465..317339cd7f03 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -39,6 +39,7 @@ #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> @@ -188,7 +189,7 @@ int fib_unmerge(struct net *net) return 0; } -static void fib_flush(struct net *net) +void fib_flush(struct net *net) { int flushed = 0; unsigned int h; @@ -230,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { - if (!dev || dev == res.fi->fib_dev) + struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); + + if (!dev || dev == nhc->nhc_dev) ret = res.type; } } @@ -317,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) #ifdef CONFIG_IP_ROUTE_MULTIPATH int ret; - for (ret = 0; ret < fi->fib_nhs; ret++) { - struct fib_nh *nh = &fi->fib_nh[ret]; + for (ret = 0; ret < fib_info_num_path(fi); ret++) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); - if (nh->fib_nh_dev == dev) { + if (nhc->nhc_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].fib_nh_dev == dev) + if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif @@ -536,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { - struct in_ifaddr *ifa; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; + *colon = ':'; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; + } + rcu_read_unlock(); + if (!ifa) return -ENODEV; 
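The devinet.c and fib_frontend.c hunks above are one mechanical conversion: the for_primary_ifa()/for_ifa() ... endfor_ifa() macros give way to open-coded walks via in_dev_for_each_ifa_rcu() and in_dev_for_each_ifa_rtnl(), and ifa_list/ifa_next gain __rcu annotations with rtnl_dereference()/rcu_assign_pointer() at every touch point, so the address list can eventually be traversed under rcu_read_lock() instead of RTNL. The subtle part is that for_primary_ifa() skipped IFA_F_SECONDARY entries implicitly, which is why converted sites such as in_dev_select_addr() and inet_select_addr() now carry an explicit flag check. A minimal userspace model of the pattern follows; the types, the flag value, and the in_dev_for_each_ifa() macro are stand-ins (the kernel's iterators additionally wrap each pointer load in rcu_dereference()/rtnl_dereference()), so read it as a sketch of the list walk, not kernel code.

    #include <stdio.h>

    #define IFA_F_SECONDARY 0x01    /* stand-in flag value */

    struct in_ifaddr {
            struct in_ifaddr *ifa_next;     /* __rcu in the kernel */
            unsigned int ifa_flags;
            unsigned int ifa_local;
    };

    struct in_device {
            struct in_ifaddr *ifa_list;     /* __rcu in the kernel */
    };

    /* Simplified stand-in for in_dev_for_each_ifa_rcu()/_rtnl(): the
     * kernel versions guard each ifa_next load with RCU accessors. */
    #define in_dev_for_each_ifa(ifa, in_dev) \
            for (ifa = (in_dev)->ifa_list; ifa; ifa = ifa->ifa_next)

    int main(void)
    {
            struct in_ifaddr sec  = { NULL, IFA_F_SECONDARY, 0x0a000002 };
            struct in_ifaddr prim = { &sec, 0,               0x0a000001 };
            struct in_device in_dev = { &prim };
            struct in_ifaddr *ifa;

            /* for_primary_ifa() used to skip secondaries implicitly;
             * the open-coded walk must filter them itself. */
            in_dev_for_each_ifa(ifa, &in_dev) {
                    if (ifa->ifa_flags & IFA_F_SECONDARY)
                            continue;
                    printf("primary addr: %08x\n", ifa->ifa_local);
            }
            return 0;
    }

The explicit continue on IFA_F_SECONDARY is the piece that is easiest to lose in such a conversion; the diff adds it at each former for_primary_ifa() site rather than hiding it inside the new iterator.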
cfg->fc_prefsrc = ifa->ifa_local; @@ -641,6 +652,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, @@ -659,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, @@ -796,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (err < 0) goto errout; break; + case RTA_NH_ID: + cfg->fc_nh_id = nla_get_u32(attr); + break; + } + } + + if (cfg->fc_nh_id) { + if (cfg->fc_oif || cfg->fc_gw_family || + cfg->fc_encap || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + return -EINVAL; } } @@ -822,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + err = -EINVAL; + goto errout; + } + tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); @@ -881,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & RTM_F_CLONED) + filter->dump_routes = false; + else + filter->dump_exceptions = false; filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); filter->flags = rtm->rtm_flags; @@ -931,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct fib_dump_filter filter = { .dump_routes = true, + .dump_exceptions = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct fib_dump_filter filter = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; @@ -950,8 +987,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } - /* fib entries are never clones and ipv4 does not use prefix flag */ - if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED)) + /* ipv4 does not use prefix flag */ + if (filter.flags & RTM_F_PREFIX) return skb->len; if (filter.table_id) { @@ -1172,8 +1209,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) * * Scan address list to be sure that addresses are really gone. 
*/ - - for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; @@ -1241,6 +1278,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } } + rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) @@ -1410,6 +1448,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); + struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { @@ -1424,9 +1463,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_UP: - for_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); - } endfor_ifa(in_dev); + } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 7945f0534db7..a68b5e21ec51 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/list.h> #include <net/ip_fib.h> +#include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a38e86b98e4f..b43a7ba5c6a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -27,6 +27,7 @@ #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/fib_rules.h> struct fib4_rule { @@ -141,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; - if (result->fi) - dev = result->fi->fib_dev; + if (result->fi) { + struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); + + dev = nhc->nhc_dev; + } /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index bfa49a88d03a..2db089e10ba0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -38,6 +38,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> +#include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> @@ -56,18 +57,21 @@ static unsigned int fib_info_cnt; #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +/* for_nexthops and change_nexthops only used when nexthop object + * is not set in a fib_info. The logic within can reference fib_nh. 
+ */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -228,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - change_nexthops(fi) { - fib_nh_release(fi->fib_net, nexthop_nh); - } endfor_nexthops(fi); + if (fi->nh) { + nexthop_put(fi->nh); + } else { + change_nexthops(fi) { + fib_nh_release(fi->fib_net, nexthop_nh); + } endfor_nexthops(fi); + } ip_fib_metrics_put(fi->fib_metrics); @@ -256,22 +264,34 @@ void fib_release_info(struct fib_info *fi) hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); - change_nexthops(fi) { - if (!nexthop_nh->fib_nh_dev) - continue; - hlist_del(&nexthop_nh->nh_hash); - } endfor_nexthops(fi) + if (fi->nh) { + list_del(&fi->nh_list); + } else { + change_nexthops(fi) { + if (!nexthop_nh->fib_nh_dev) + continue; + hlist_del(&nexthop_nh->nh_hash); + } endfor_nexthops(fi) + } fi->fib_dead = 1; fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); } -static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { - const struct fib_nh *onh = ofi->fib_nh; + const struct fib_nh *onh; + + if (fi->nh || ofi->nh) + return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; + + if (ofi->fib_nhs == 0) + return 0; for_nexthops(fi) { + onh = fib_info_nh(ofi, nhsel); + if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || @@ -292,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; - - onh++; } endfor_nexthops(fi); return 0; } @@ -307,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) (val >> (DEVINDEX_HASHBITS * 2))) & mask; } -static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, + u32 prefsrc, u32 priority) { - unsigned int mask = (fib_info_hash_size - 1); - unsigned int val = fi->fib_nhs; + unsigned int val = init_val; - val ^= (fi->fib_protocol << 8) | fi->fib_scope; - val ^= (__force u32)fi->fib_prefsrc; - val ^= fi->fib_priority; - for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); - } endfor_nexthops(fi) + val ^= (protocol << 8) | scope; + val ^= prefsrc; + val ^= priority; + + return val; +} + +static unsigned int fib_info_hashfn_result(unsigned int val) +{ + unsigned int mask = (fib_info_hash_size - 1); return (val ^ (val >> 7) ^ (val >> 12)) & mask; } -static struct fib_info *fib_find_info(const struct fib_info *nfi) +static inline unsigned int fib_info_hashfn(struct fib_info *fi) +{ + unsigned int val; + + val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, + fi->fib_scope, (__force u32)fi->fib_prefsrc, + fi->fib_priority); + + if (fi->nh) { + val ^= fib_devindex_hashfn(fi->nh->id); + } else { + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->fib_nh_oif); + } endfor_nexthops(fi) + } + + return 
fib_info_hashfn_result(val); +} + +/* no metrics, only nexthop id */ +static struct fib_info *fib_find_info_nh(struct net *net, + const struct fib_config *cfg) +{ + struct hlist_head *head; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + cfg->fc_protocol, cfg->fc_scope, + (__force u32)cfg->fc_prefsrc, + cfg->fc_priority); + hash = fib_info_hashfn_result(hash); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, head, fib_hash) { + if (!net_eq(fi->fib_net, net)) + continue; + if (!fi->nh || fi->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_protocol == fi->fib_protocol && + cfg->fc_scope == fi->fib_scope && + cfg->fc_prefsrc == fi->fib_prefsrc && + cfg->fc_priority == fi->fib_priority && + cfg->fc_type == fi->fib_type && + cfg->fc_table == fi->fib_tb_id && + !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) + return fi; + } + + return NULL; +} + +static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head; struct fib_info *fi; @@ -344,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && - (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + nh_comp(fi, nfi) == 0) return fi; } @@ -386,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - if (fi->fib_nhs) { + if (fi->nh) + payload += nla_total_size(4); /* RTA_NH_ID */ + + if (nhs) { size_t nh_encapsize = 0; - /* Also handles the special case fib_nhs == 1 */ + /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ - for_nexthops(fi) { - if (nh->fib_nh_lws) { + for (i = 0; i < fib_info_num_path(fi); i++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, i); + + if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->fib_nh_lws); + nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } - } endfor_nexthops(fi); + } /* all nexthops are packed in a nested attribute */ - payload += nla_total_size((fi->fib_nhs * nhsize) + - nh_encapsize); + payload += nla_total_size((nhs * nhsize) + nh_encapsize); } @@ -574,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, return nhs; } +/* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; + struct fib_nh *nh; int ret; change_nexthops(fi) { @@ -642,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, } endfor_nexthops(fi); ret = -EINVAL; - if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { + nh = fib_info_nh(fi, 0); + if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { - if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + if 
(cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && - fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; @@ -670,12 +753,13 @@ errout: return ret; } +/* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; - if (fi->fib_nhs < 2) + if (fib_info_num_path(fi) < 2) return; total = 0; @@ -756,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; + if (cfg->fc_nh_id) { + if (fi->nh && cfg->fc_nh_id == fi->nh->id) + return 0; + return 1; + } + if (cfg->fc_oif || cfg->fc_gw_family) { + struct fib_nh *nh = fib_info_nh(fi, 0); + if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, - fi->fib_nh, cfg, extack)) + nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && - cfg->fc_flow != fi->fib_nh->nh_tclassid) + cfg->fc_flow != nh->nh_tclassid) return 1; #endif - if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && - cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && - cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; @@ -1088,15 +1180,13 @@ out: return err; } -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, + struct netlink_ext_ack *extack) { - struct net *net = cfg->fc_nlinfo.nl_net; - u32 table = cfg->fc_table; int err; if (nh->fib_nh_gw_family == AF_INET) - err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else @@ -1187,11 +1277,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } -__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, + unsigned char scope) { - nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, - nh->fib_nh_gw4, - nh->nh_parent->fib_scope); + struct fib_nh *nh; + + if (nhc->nhc_family != AF_INET) + return inet_select_addr(nhc->nhc_dev, 0, scope); + + nh = container_of(nhc, struct fib_nh, nh_common); + nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; @@ -1200,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; - 
struct fib_nh *nh; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; - nh = container_of(nhc, struct fib_nh, nh_common); - if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) - return nh->nh_saddr; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + } - return fib_info_update_nh_saddr(net, nh); + return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) @@ -1241,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, { int err; struct fib_info *fi = NULL; + struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1260,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } + if (cfg->fc_nh_id) { + if (!cfg->fc_mx) { + fi = fib_find_info_nh(net, cfg); + if (fi) { + fi->fib_treeref++; + return fi; + } + } + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + nhs = 0; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); @@ -1295,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, extack); - if (unlikely(IS_ERR(fi->fib_metrics))) { + if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); @@ -1312,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; - change_nexthops(fi) { - nexthop_nh->nh_parent = fi; - } endfor_nexthops(fi) + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = 0; + fi->nh = nh; + } + } else { + change_nexthops(fi) { + nexthop_nh->nh_parent = fi; + } endfor_nexthops(fi) - if (cfg->fc_mp) - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - else - err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + if (cfg->fc_mp) + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, + extack); + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + } if (err != 0) goto failure; @@ -1350,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - if (cfg->fc_scope == RT_SCOPE_HOST) { + if (fi->nh) { + err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); + if (err) + goto failure; + } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. 
*/ @@ -1365,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; - nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); + nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; @@ -1373,7 +1504,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, nexthop_nh, extack); + err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, + cfg->fc_table, cfg->fc_scope, + extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) @@ -1388,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - change_nexthops(fi) { - fib_info_update_nh_saddr(net, nexthop_nh); - if (nexthop_nh->fib_nh_gw_family == AF_INET6) - fi->fib_nh_is_v6 = true; - } endfor_nexthops(fi) + if (!fi->nh) { + change_nexthops(fi) { + fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, + fi->fib_scope); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; + } endfor_nexthops(fi) - fib_rebalance(fi); + fib_rebalance(fi); + } link_it: ofi = fib_find_info(fi); @@ -1416,16 +1552,20 @@ link_it: head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; hlist_add_head(&fi->fib_lhash, head); } - change_nexthops(fi) { - struct hlist_head *head; - unsigned int hash; + if (fi->nh) { + list_add(&fi->nh_list, &nh->fi_list); + } else { + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; - if (!nexthop_nh->fib_nh_dev) - continue; - hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_add_head(&nexthop_nh->nh_hash, head); - } endfor_nexthops(fi) + if (!nexthop_nh->fib_nh_dev) + continue; + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nexthop_nh->nh_hash, head); + } endfor_nexthops(fi) + } spin_unlock_bh(&fib_info_lock); return fi; @@ -1552,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) if (!mp) goto nla_put_failure; + if (unlikely(fi->nh)) { + if (nexthop_mpath_fill_node(skb, fi->nh) < 0) + goto nla_put_failure; + goto mp_end; + } + for_nexthops(fi) { if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) goto nla_put_failure; @@ -1562,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif } endfor_nexthops(fi); +mp_end: nla_nest_end(skb, mp); return 0; @@ -1580,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { + unsigned int nhs = fib_info_num_path(fi); struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1615,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; - if (fi->fib_nhs == 1) { - struct fib_nh *nh = &fi->fib_nh[0]; + + if (fi->nh) { + if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) + goto nla_put_failure; + if (nexthop_is_blackhole(fi->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + } + + if (nhs == 1) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; - if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) + if (fib_nexthop_info(skb, nhc, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid 
&& - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; + } #endif } else { if (fib_add_multipath(skb, fi) < 0) @@ -1709,7 +1870,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh, * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ -static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) +void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; @@ -1745,7 +1906,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev) - nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1754,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + * + * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { @@ -1835,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + struct fib_nh *nh; if (fa->fa_slen != slen) continue; @@ -1856,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].fib_nh_gw4 || - next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) + + nh = fib_info_nh(next_fi, 0); + if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1899,6 +2064,8 @@ out: /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. + * + * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { @@ -1993,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; + if (unlikely(res->fi->nh)) { + nexthop_path_fib_result(res, hash); + return; + } + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { if (!fib_good_nh(nexthop_nh)) @@ -2021,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res, goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1) { + if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 868c74771fa9..4400f5051977 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -338,12 +338,18 @@ static struct tnode *tnode_alloc(int bits) static inline void empty_child_inc(struct key_vector *n) { - ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; + tn_info(n)->empty_children++; + + if (!tn_info(n)->empty_children) + tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { - tn_info(n)->empty_children-- ? 
: tn_info(n)->full_children--; + if (!tn_info(n)->empty_children) + tn_info(n)->full_children--; + + tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) @@ -1449,6 +1455,7 @@ found: fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { +out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif @@ -1457,7 +1464,13 @@ found: } if (fi->fib_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + + if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; + } + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); if (nhc->nhc_flags & RTNH_F_DEAD) @@ -1931,6 +1944,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) return found; } +/* derived from fib_trie_free */ +static void __fib_info_notify_update(struct net *net, struct fib_table *tb, + struct nl_info *info) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct fib_alias *fa; + + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + pn = node_parent(pn); + cindex = get_index(pkey, pn); + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) + continue; + + rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, + info, NLM_F_REPLACE); + + /* call_fib_entry_notifiers will be removed when + * in-kernel notifier is implemented and supported + * for nexthop objects + */ + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + n->key, + KEYLENGTH - fa->fa_slen, fa, + NULL); + } + } +} + +void fib_info_notify_update(struct net *net, struct nl_info *info) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + __fib_info_notify_update(net, tb, info); + } +} + static void fib_leaf_notify(struct net *net, struct key_vector *l, struct fib_table *tb, struct notifier_block *nb) { @@ -2006,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); + int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; - int i, s_i; - if (filter->filter_set) + if (filter->filter_set || + !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; + s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - int err; + struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; + i_fa = 0; + if (tb->tb_id != fa->tb_id) goto next; @@ -2030,29 +2118,43 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, goto next; if ((filter->protocol && - fa->fa_info->fib_protocol != filter->protocol)) + fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && - !fib_info_nh_uses_dev(fa->fa_info, filter->dev)) + !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } - err = 
fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, KEYLENGTH - fa->fa_slen, - fa->fa_tos, fa->fa_info, flags); - if (err < 0) { - cb->args[4] = i; - return err; + if (filter->dump_routes && !s_fa) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fi, flags); + if (err < 0) + goto stop; + i_fa++; } + + if (filter->dump_exceptions) { + err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, + &i_fa, s_fa); + if (err < 0) + goto stop; + } + next: i++; } cb->args[4] = i; return skb->len; + +stop: + cb->args[4] = i; + cb->args[5] = i_fa; + return err; } /* rcu_read_lock needs to be hold by caller from readside */ @@ -2634,14 +2736,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->fib_nh_gw4) - flags |= RTF_GATEWAY; + if (fi) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + + if (nhc->nhc_gw.ipv4) + flags |= RTF_GATEWAY; + } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; @@ -2672,7 +2778,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - const struct fib_info *fi = fa->fa_info; + struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); @@ -2685,26 +2791,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_setwidth(seq, 127); - if (fi) + if (fi) { + struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + __be32 gw = 0; + + if (nhc->nhc_gw_family == AF_INET) + gw = nhc->nhc_gw.ipv4; + seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->fib_nh_gw4, flags, 0, 0, + nhc->nhc_dev ? nhc->nhc_dev->name : "*", + prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? 
fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); - else + } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); - + } seq_pad(seq, '\n'); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7c857c72aad1..1510e951f451 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -201,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return *this_cpu_ptr(net->ipv4.icmp_sk); + return this_cpu_read(*net->ipv4.icmp_sk); } /* Called with BH disabled */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a57f0d69eadb..9a206931a342 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -332,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; - } endfor_ifa(in_dev); + } return htonl(INADDR_ANY); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7fd6db3fe366..f5c163d4771b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -649,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) EXPORT_SYMBOL(inet_rtx_syn_ack); /* return true if req was found in the ehash table */ -static bool reqsk_queue_unlink(struct request_sock_queue *queue, - struct request_sock *req) +static bool reqsk_queue_unlink(struct request_sock *req) { struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; @@ -669,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { - if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + if (reqsk_queue_unlink(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ce6969896f5..d666756be5f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -106,48 +106,90 @@ int inet_frags_init(struct inet_frags *f) if (!f->frags_cachep) return -ENOMEM; + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - /* We must wait that all inet_frag_destroy_rcu() have completed. */ - rcu_barrier(); + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; + int count; - /* If we can not cancel the timer, it means this frag_queue - * is already disappearing, we have nothing to do. - * Otherwise, we own a refcount until the end of this function. - */ - if (!del_timer(&fq->timer)) - return; + count = del_timer_sync(&fq->timer) ? 
1 : 0; spin_lock_bh(&fq->lock); if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; - refcount_dec(&fq->refcnt); + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } spin_unlock_bh(&fq->lock); - inet_frag_put(fq); + if (refcount_sub_and_test(count, &fq->refcnt)) + inet_frag_destroy(fq); +} + +static void fqdir_work_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + struct inet_frags *f = fqdir->f; + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? :) + */ + rcu_barrier(); + + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + kfree(fqdir); } -void inet_frags_exit_net(struct netns_frags *nf) +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { - nf->high_thresh = 0; /* prevent creation of new frags */ + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; - rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); + if (!fqdir) + return -ENOMEM; + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; + return 0; } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_init); + +void fqdir_exit(struct fqdir *fqdir) +{ + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(system_wq, &fqdir->destroy_work); +} +EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -155,11 +197,23 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - struct netns_frags *nf = fq->net; + struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; - rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); - refcount_dec(&fq->refcnt); + rcu_read_lock(); + /* The RCU read lock provides a memory barrier + * guaranteeing that if fqdir->dead is false then + * the hash table destruction will not start until + * after we unlock. Paired with inet_frags_exit_net(). + */ + if (!fqdir->dead) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + refcount_dec(&fq->refcnt); + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); @@ -168,7 +222,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); - struct inet_frags *f = q->net->f; + struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); @@ -199,7 +253,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct netns_frags *nf; + struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; struct inet_frags *f; @@ -207,18 +261,18 @@ void inet_frag_destroy(struct inet_frag_queue *q) WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. 
*/ - nf = q->net; - f = nf->f; + fqdir = q->fqdir; + f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); - sub_frag_mem_limit(nf, sum); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { @@ -228,9 +282,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (!q) return NULL; - q->net = nf; + q->fqdir = fqdir; f->constructor(q, arg); - add_frag_mem_limit(nf, f->qsize); + add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); @@ -239,21 +293,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, return q; } -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { - struct inet_frags *f = nf->f; + struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; - q = inet_frag_alloc(nf, f, arg); + q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } - mod_timer(&q->timer, jiffies + nf->timeout); + mod_timer(&q->timer, jiffies + fqdir->timeout); - *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; @@ -265,18 +319,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { struct inet_frag_queue *fq = NULL, *prev; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh) return NULL; rcu_read_lock(); - prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) - fq = inet_frag_create(nf, key, &prev); + fq = inet_frag_create(fqdir, key, &prev); if (prev && !IS_ERR(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) @@ -387,7 +441,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, delta += head->truesize; if (delta) - add_frag_mem_limit(q->net, delta); + add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part @@ -409,7 +463,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(q->net, clone->truesize); + add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { @@ -462,7 +516,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); *nextp = NULL; skb_mark_not_on_list(head); @@ -490,7 +544,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) if (head == q->fragments_tail) q->fragments_tail = NULL; - 
sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); return head; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c4503073248b..97824864e40d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -316,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cf2b0a6a3337..4385eb9e781f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, - frags); - struct net *net = container_of(ipv4, struct net, ipv4); + struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->net->max_dist ? + qp->peer = q->fqdir->max_dist ? inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -142,9 +140,13 @@ static void ip_expire(struct timer_list *t) int err; qp = container_of(frag, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); + net = qp->q.fqdir->net; rcu_read_lock(); + + if (qp->q.fqdir->dead) + goto out_rcu_unlock; + spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -211,7 +213,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->ipv4.frags, &key); + q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; @@ -222,7 +224,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = qp->q.net->max_dist; + unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; @@ -236,12 +238,8 @@ static int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments_tail && (end - start) > max; - if (rc) { - struct net *net; - - net = container_of(qp->q.net, struct net, ipv4.frags); - __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - } + if (rc) + __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } @@ -250,13 +248,13 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); + sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -273,7 +271,7 @@ static int ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. 
*/ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; @@ -352,7 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); + add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; @@ -399,7 +397,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; @@ -544,30 +542,24 @@ static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", - .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", - .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", - .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", - .data = &init_net.ipv4.frags.max_dist, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -600,13 +592,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv4.frags.high_thresh; - table[0].extra1 = &net->ipv4.frags.low_thresh; - table[1].data = &net->ipv4.frags.low_thresh; - table[1].extra2 = &net->ipv4.frags.high_thresh; - table[2].data = &net->ipv4.frags.timeout; - table[3].data = &net->ipv4.frags.max_dist; } + table[0].data = &net->ipv4.fqdir->high_thresh; + table[0].extra1 = &net->ipv4.fqdir->low_thresh; + table[1].data = &net->ipv4.fqdir->low_thresh; + table[1].extra2 = &net->ipv4.fqdir->high_thresh; + table[2].data = &net->ipv4.fqdir->timeout; + table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) @@ -654,6 +646,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) { int res; + res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); + if (res < 0) + return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -668,36 +663,38 @@ static int __net_init ipv4_frags_init_net(struct net *net) * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 4 * 1024 * 1024; - net->ipv4.frags.low_thresh = 3 * 1024 * 1024; + net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; + net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. 
*/ - net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.fqdir->timeout = IP_FRAG_TIME; - net->ipv4.frags.max_dist = 64; - net->ipv4.frags.f = &ip4_frags; + net->ipv4.fqdir->max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res < 0) - return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); return res; } +static void __net_exit ipv4_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv4.fqdir); +} + static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { - .init = ipv4_frags_init_net, - .exit = ipv4_frags_exit_net, + .init = ipv4_frags_init_net, + .pre_exit = ipv4_frags_pre_exit_net, + .exit = ipv4_frags_exit_net, }; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..ddaa01ec2bce 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -473,6 +473,7 @@ error: *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8c2ec35b6512..cc7ef0d05bbd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -315,14 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; - int ret; + bool do_cn = false; + int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_CN: + do_cn = true; + /* fall through */ + case NET_XMIT_SUCCESS: + break; + default: kfree_skb(skb); return ret; } @@ -338,7 +354,8 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk, skb_dst_set(skb, &new_rt->dst); } - return dev_loopback_xmit(net, sk, skb); + err = dev_loopback_xmit(net, sk, skb); + return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -537,9 +554,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_hash(to, from); - /* Copy the flags to each fragment. 
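- *
- * (not lost: the per-fragment IPCB flags copy reappears below in the
- * new ip_fraglist_ipcb_prepare() and ip_frag_ipcb() helpers)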
*/ - IPCB(to)->flags = IPCB(from)->flags; - #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif @@ -573,6 +587,175 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return ip_do_fragment(net, sk, skb, output); } +void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, + unsigned int hlen, struct ip_fraglist_iter *iter) +{ + unsigned int first_len = skb_pagelen(skb); + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->iph = iph; + iter->hlen = hlen; + + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_init); + +static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, + struct ip_fraglist_iter *iter) +{ + struct sk_buff *to = iter->frag; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(skb)->flags; + + if (iter->offset == 0) + ip_options_fragment(to); +} + +void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) +{ + unsigned int hlen = iter->hlen; + struct iphdr *iph = iter->iph; + struct sk_buff *frag; + + frag = iter->frag; + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iter->iph = ip_hdr(frag); + iph = iter->iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + iter->offset += skb->len - hlen; + iph->frag_off = htons(iter->offset >> 3); + if (frag->next) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_prepare); + +void ip_frag_init(struct sk_buff *skb, unsigned int hlen, + unsigned int ll_rs, unsigned int mtu, + struct ip_frag_state *state) +{ + struct iphdr *iph = ip_hdr(skb); + + state->hlen = hlen; + state->ll_rs = ll_rs; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + state->not_last_frag = iph->frag_off & htons(IP_MF); +} +EXPORT_SYMBOL(ip_frag_init); + +static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, + bool first_frag, struct ip_frag_state *state) +{ + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + + if (IPCB(from)->flags & IPSKB_FRAG_PMTU) + state->iph->frag_off |= htons(IP_DF); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. 
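+ *
+ * ip_options_fragment() implements the fixing: options that lack the
+ * copied flag (RFC 791) are overwritten with IPOPT_NOOP, so following
+ * fragments inherit only the options that must be replicated.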
+ */ + if (first_frag) + ip_options_fragment(from); +} + +struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) +{ + unsigned int len = state->left; + struct sk_buff *skb2; + struct iphdr *iph; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) { + len &= ~7; + } + + /* Allocate buffer */ + skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); + if (!skb2) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, state->ll_rs); + skb_put(skb2, len + state->hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + state->hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) + BUG(); + state->left -= len; + + /* + * Fill in the new header fields. + */ + iph = ip_hdr(skb2); + iph->frag_off = htons((state->offset >> 3)); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (state->left > 0 || state->not_last_frag) + iph->frag_off |= htons(IP_MF); + state->ptr += len; + state->offset += len; + + iph->tot_len = htons(len + state->hlen); + + ip_send_check(iph); + + return skb2; +} +EXPORT_SYMBOL(ip_frag_next); + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -584,12 +767,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; - int ptr; struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs; - int offset; - __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); + unsigned int mtu, hlen, ll_rs; + struct ip_fraglist_iter iter; + struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ @@ -654,49 +836,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } /* Everything is OK. Generate! */ - - err = 0; - offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - iph->tot_len = htons(first_len); - iph->frag_off = htons(IP_MF); - ip_send_check(iph); + ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. 
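 *
 * Resulting fast-path loop in full (a sketch assembled from the hunks of
 * this patch, not new behaviour):
 *
 *	ip_fraglist_init(skb, iph, hlen, &iter);
 *	for (;;) {
 *		if (iter.frag) {
 *			ip_fraglist_ipcb_prepare(skb, &iter);
 *			ip_fraglist_prepare(skb, &iter);
 *		}
 *		err = output(net, sk, skb);
 *		if (!err)
 *			IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip_fraglist_next(&iter);
 *	}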
*/ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), iph, hlen); - iph = ip_hdr(frag); - iph->tot_len = htons(frag->len); - ip_copy_metadata(frag, skb); - if (offset == 0) - ip_options_fragment(frag); - offset += skb->len - hlen; - iph->frag_off = htons(offset>>3); - if (frag->next) - iph->frag_off |= htons(IP_MF); - /* Ready, complete checksum */ - ip_send_check(iph); + if (iter.frag) { + ip_fraglist_ipcb_prepare(skb, &iter); + ip_fraglist_prepare(skb, &iter); } err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip_fraglist_next(&iter); } if (err == 0) { @@ -704,7 +861,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; @@ -720,105 +877,29 @@ slow_path_clean: } slow_path: - iph = ip_hdr(skb); - - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; - not_last_frag = iph->frag_off & htons(IP_MF); + ip_frag_init(skb, hlen, ll_rs, mtu, &state); /* * Keep copying data until we run out. */ - while (left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } + while (state.left > 0) { + bool first_frag = (state.offset == 0); - /* Allocate buffer */ - skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); - if (!skb2) { - err = -ENOMEM; + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); goto fail; } - - /* - * Set up data on packet - */ - - ip_copy_metadata(skb2, skb); - skb_reserve(skb2, ll_rs); - skb_put(skb2, len + hlen); - skb_reset_network_header(skb2); - skb2->transport_header = skb2->network_header + hlen; - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - - skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); - - /* - * Copy a block of the IP datagram. - */ - if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) - BUG(); - left -= len; - - /* - * Fill in the new header fields. - */ - iph = ip_hdr(skb2); - iph->frag_off = htons((offset >> 3)); - - if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) - iph->frag_off |= htons(IP_DF); - - /* ANK: dirty, but effective trick. Upgrade options only if - * the segment to be fragmented was THE FIRST (otherwise, - * options are already fixed) and make it ONCE - * on the initial skb, so that all the following fragments - * will inherit fixed options. - */ - if (offset == 0) - ip_options_fragment(skb); - - /* - * Added AC : If we are fragmenting a fragment that's not the - * last fragment then keep MF on each bit - */ - if (left > 0 || not_last_frag) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; + ip_frag_ipcb(skb, skb2, first_frag, &state); /* * Put this fragment into the sending queue. 
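 *
 * For reference, the whole slow path now follows this shape (sketch
 * assembled from the surrounding hunks):
 *
 *	ip_frag_init(skb, hlen, ll_rs, mtu, &state);
 *	while (state.left > 0) {
 *		bool first_frag = (state.offset == 0);
 *
 *		skb2 = ip_frag_next(skb, &state);
 *		if (IS_ERR(skb2)) {
 *			err = PTR_ERR(skb2);
 *			goto fail;
 *		}
 *		ip_frag_ipcb(skb, skb2, first_frag, &state);
 *		err = output(net, sk, skb2);
 *		if (err)
 *			goto fail;
 *	}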
*/ - iph->tot_len = htons(len + hlen); - - ip_send_check(iph); - err = output(net, sk, skb2); if (err) goto fail; @@ -1568,7 +1649,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len) + unsigned int len, u64 transmit_time) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1584,6 +1665,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipcm_init(&ipc); ipc.addr = daddr; + ipc.sockc.transmit_time = transmit_time; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 87ca2c42359b..a4e07e5e9c11 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par) unsigned char *arpptr; int pln, hln; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 5f116c3749b4..5930d3b02555 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -29,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; iph = ip_hdr(skb); oldtos = iph->tos; @@ -58,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr)) return true; - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) return false; tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 64d9563c0218..8e7f84ec783d 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -3,258 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct iphdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, - __be32 daddr) -{ - struct iphdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - iph->version = 4; - iph->ihl = sizeof(*iph) / 4; - iph->tos = 0; - iph->id = 0; - iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; - iph->protocol = IPPROTO_TCP; - iph->check = 0; - iph->saddr = saddr; - iph->daddr = daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct iphdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - skb_dst_set_noref(nskb, skb_dst(skb)); - 
nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) - goto free_nskb; - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. 
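- *
- * (the IPv4-specific synproxy helpers deleted in this hunk are assumed
- * to be consolidated into the shared nf_synproxy core reached through
- * nf_synproxy_ipv4_init()/nf_synproxy_ipv4_fini() further below)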
- */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) @@ -306,135 +59,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(void *priv, - struct sk_buff *skb, - 
const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - unsigned int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb) || - ip_hdr(skb)->protocol != IPPROTO_TCP) - return NF_ACCEPT; - - thoff = ip_hdrlen(skb); - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv4_synproxy_ops[] = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -449,16 +73,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref4 == 0) { - err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err) { - 
nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv4_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref4++; return err; } @@ -466,10 +86,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref4--; - if (snet->hook_ref4 == 0) - nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); + nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6eefde5bc468..69697eb4bfc6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -2,7 +2,7 @@ /* * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index dfea10f13878..87b711fd5a44 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> */ #include <linux/module.h> @@ -58,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } - /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index 657d2dcec3cc..717b726504fe 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b6dd39636bea..b2bae0b0e42a 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { + const struct in_ifaddr *ifa; struct in_device *indev; __be32 laddr; @@ -57,10 +58,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { + + in_dev_for_each_ifa_rcu(ifa, indev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + laddr = ifa->ifa_local; break; - } endfor_ifa(indev); + } return laddr ? 
laddr : daddr; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c new file mode 100644 index 000000000000..5fe5a3981d43 --- /dev/null +++ b/net/ipv4/nexthop.c @@ -0,0 +1,1828 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Generic nexthop implementation + * + * Copyright (c) 2017-19 Cumulus Networks + * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> + */ + +#include <linux/nexthop.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <net/arp.h> +#include <net/ipv6_stubs.h> +#include <net/lwtunnel.h> +#include <net/ndisc.h> +#include <net/nexthop.h> +#include <net/route.h> +#include <net/sock.h> + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo); + +#define NH_DEV_HASHBITS 8 +#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) + +static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { + [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, + [NHA_ID] = { .type = NLA_U32 }, + [NHA_GROUP] = { .type = NLA_BINARY }, + [NHA_GROUP_TYPE] = { .type = NLA_U16 }, + [NHA_BLACKHOLE] = { .type = NLA_FLAG }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_GATEWAY] = { .type = NLA_BINARY }, + [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, + [NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_GROUPS] = { .type = NLA_FLAG }, + [NHA_MASTER] = { .type = NLA_U32 }, +}; + +static unsigned int nh_dev_hashfn(unsigned int val) +{ + unsigned int mask = NH_DEV_HASHSIZE - 1; + + return (val ^ + (val >> NH_DEV_HASHBITS) ^ + (val >> (NH_DEV_HASHBITS * 2))) & mask; +} + +static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) +{ + struct net_device *dev = nhi->fib_nhc.nhc_dev; + struct hlist_head *head; + unsigned int hash; + + WARN_ON(!dev); + + hash = nh_dev_hashfn(dev->ifindex); + head = &net->nexthop.devhash[hash]; + hlist_add_head(&nhi->dev_hash, head); +} + +static void nexthop_free_mpath(struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) + WARN_ON(nhg->nh_entries[i].nh); + + kfree(nhg); +} + +static void nexthop_free_single(struct nexthop *nh) +{ + struct nh_info *nhi; + + nhi = rcu_dereference_raw(nh->nh_info); + switch (nhi->family) { + case AF_INET: + fib_nh_release(nh->net, &nhi->fib_nh); + break; + case AF_INET6: + ipv6_stub->fib6_nh_release(&nhi->fib6_nh); + break; + } + kfree(nhi); +} + +void nexthop_free_rcu(struct rcu_head *head) +{ + struct nexthop *nh = container_of(head, struct nexthop, rcu); + + if (nh->is_group) + nexthop_free_mpath(nh); + else + nexthop_free_single(nh); + + kfree(nh); +} +EXPORT_SYMBOL_GPL(nexthop_free_rcu); + +static struct nexthop *nexthop_alloc(void) +{ + struct nexthop *nh; + + nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + if (nh) { + INIT_LIST_HEAD(&nh->fi_list); + INIT_LIST_HEAD(&nh->f6i_list); + INIT_LIST_HEAD(&nh->grp_list); + } + return nh; +} + +static struct nh_group *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nhg; + + nhg = kzalloc(sz, GFP_KERNEL); + if (nhg) + nhg->num_nh = num_nh; + + return nhg; +} + +static void nh_base_seq_inc(struct net *net) +{ + while (++net->nexthop.seq == 0) + ; +} + +/* no reference taken; rcu lock or rtnl must be held */ +struct nexthop *nexthop_find_by_id(struct net *net, u32 id) +{ + struct rb_node **pp, *parent = NULL, *next; + + pp = &net->nexthop.rb_root.rb_node; + while (1) { + struct nexthop *nh; + + next = rcu_dereference_raw(*pp); + if (!next) + break; + 
parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (id < nh->id) + pp = &next->rb_left; + else if (id > nh->id) + pp = &next->rb_right; + else + return nh; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nexthop_find_by_id); + +/* used for auto id allocation; called with rtnl held */ +static u32 nh_find_unused_id(struct net *net) +{ + u32 id_start = net->nexthop.last_id_allocated; + + while (1) { + net->nexthop.last_id_allocated++; + if (net->nexthop.last_id_allocated == id_start) + break; + + if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) + return net->nexthop.last_id_allocated; + } + return 0; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nexthop_grp *p; + size_t len = nhg->num_nh * sizeof(*p); + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nhg->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nhg->num_nh; ++i) { + p->id = nhg->nh_entries[i].nh->id; + p->weight = nhg->nh_entries[i].weight - 1; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, + int event, u32 portid, u32 seq, unsigned int nlflags) +{ + struct fib6_nh *fib6_nh; + struct fib_nh *fib_nh; + struct nlmsghdr *nlh; + struct nh_info *nhi; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = nh->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nhg)) + goto nla_put_failure; + goto out; + } + + nhi = rtnl_dereference(nh->nh_info); + nhm->nh_family = nhi->family; + if (nhi->reject_nh) { + if (nla_put_flag(skb, NHA_BLACKHOLE)) + goto nla_put_failure; + goto out; + } else { + const struct net_device *dev; + + dev = nhi->fib_nhc.nhc_dev; + if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) + goto nla_put_failure; + } + + nhm->nh_scope = nhi->fib_nhc.nhc_scope; + switch (nhi->family) { + case AF_INET: + fib_nh = &nhi->fib_nh; + if (fib_nh->fib_nh_gw_family && + nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + goto nla_put_failure; + break; + + case AF_INET6: + fib6_nh = &nhi->fib6_nh; + if (fib6_nh->fib_nh_gw_family && + nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) + goto nla_put_failure; + break; + } + + if (nhi->fib_nhc.nhc_lwtstate && + lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, + NHA_ENCAP, NHA_ENCAP_TYPE) < 0) + goto nla_put_failure; + +out: + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t nh_nlmsg_size_grp(struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + + return nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ +} + +static size_t nh_nlmsg_size_single(struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + size_t sz; + + /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE + * are mutually exclusive + */ + sz = nla_total_size(4); /* NHA_OIF */ + + switch (nhi->family) { + case AF_INET: + if (nhi->fib_nh.fib_nh_gw_family) + sz 
+= nla_total_size(4); /* NHA_GATEWAY */ + break; + + case AF_INET6: + /* NHA_GATEWAY */ + if (nhi->fib6_nh.fib_nh_gw_family) + sz += nla_total_size(sizeof(const struct in6_addr)); + break; + } + + if (nhi->fib_nhc.nhc_lwtstate) { + sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); + sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ + } + + return sz; +} + +static size_t nh_nlmsg_size(struct nexthop *nh) +{ + size_t sz = nla_total_size(4); /* NHA_ID */ + + if (nh->is_group) + sz += nh_nlmsg_size_grp(nh); + else + sz += nh_nlmsg_size_single(nh); + + return sz; +} + +static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) +{ + unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); + if (!skb) + goto errout; + + err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); + if (err < 0) { + /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); +} + +static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, + struct netlink_ext_ack *extack) +{ + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + /* nested multipath (group within a group) is not + * supported + */ + if (nhg->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh && npaths > 1) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group with more than 1 path"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + unsigned int i, j; + + if (len & (sizeof(struct nexthop_grp) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + if (nhg[i].resvd1 || nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + return -EINVAL; + } + if (nhg[i].weight > 254) { + NL_SET_ERR_MSG(extack, "Invalid value for weight"); + return -EINVAL; + } + for (j = i + 1; j < len; ++j) { + if (nhg[i].id == nhg[j].id) { + NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); + return -EINVAL; + } + } + } + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg[i].id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, len, extack)) + return -EINVAL; + } + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static bool ipv6_good_nh(const struct fib6_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state 
& NUD_VALID); +} + +static bool ipv4_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nexthop *rc = NULL; + struct nh_group *nhg; + int i; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + nhi = rcu_dereference(nhge->nh->nh_info); + switch (nhi->family) { + case AF_INET: + if (ipv4_good_nh(&nhi->fib_nh)) + return nhge->nh; + break; + case AF_INET6: + if (ipv6_good_nh(&nhi->fib6_nh)) + return nhge->nh; + break; + } + + if (!rc) + rc = nhge->nh; + } + + return rc; +} +EXPORT_SYMBOL_GPL(nexthop_select_path); + +int nexthop_for_each_fib6_nh(struct nexthop *nh, + int (*cb)(struct fib6_nh *nh, void *arg), + void *arg) +{ + struct nh_info *nhi; + int err; + + if (nh->is_group) { + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_rtnl(nh->nh_grp); + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + nhi = rcu_dereference_rtnl(nhge->nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + } else { + nhi = rcu_dereference_rtnl(nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); + +static int check_src_addr(const struct in6_addr *saddr, + struct netlink_ext_ack *extack) +{ + if (!ipv6_addr_any(saddr)) { + NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); + return -EINVAL; + } + return 0; +} + +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + /* fib6_src is unique to a fib6_info and limits the ability to cache + * routes in fib6_nh within a nexthop that is potentially shared + * across multiple fib entries. If the config wants to use source + * routing it can not use nexthop objects. mlxsw also does not allow + * fib6_src on routes. 
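+ *
+ * Hypothetical example: a source-routed entry such as
+ *	ip -6 route add 2001:db8::/64 from 2001:db8:f::/64 nhid 7
+ * is rejected by check_src_addr() below, while the same route without
+ * the "from" prefix may use the nexthop object.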
+ */ + if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) + return -EINVAL; + + if (nh->is_group) { + struct nh_group *nhg; + + nhg = rtnl_dereference(nh->nh_grp); + if (nhg->has_v4) + goto no_v4_nh; + } else { + nhi = rtnl_dereference(nh->nh_info); + if (nhi->family == AF_INET) + goto no_v4_nh; + } + + return 0; +no_v4_nh: + NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(fib6_check_nexthop); + +/* if existing nexthop has ipv6 routes linked to it, need + * to verify this new spec works with ipv6 + */ +static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib6_info *f6i; + + if (list_empty(&old->f6i_list)) + return 0; + + list_for_each_entry(f6i, &old->f6i_list, nh_list) { + if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) + return -EINVAL; + } + + return fib6_check_nexthop(new, NULL, extack); +} + +static int nexthop_check_scope(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); + return -EINVAL; + } + + if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); + return -EINVAL; + } + + return 0; +} + +/* Invoked by fib add code to verify nexthop by id is ok with + * config for prefix; parts of fib_check_nh not done when nexthop + * object is used. + */ +int fib_check_nexthop(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + int err = 0; + + if (nh->is_group) { + struct nh_group *nhg; + + if (scope == RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); + err = -EINVAL; + goto out; + } + + nhg = rtnl_dereference(nh->nh_grp); + /* all nexthops in a group have the same scope */ + err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + } else { + err = nexthop_check_scope(nh, scope, extack); + } +out: + return err; +} + +static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib_info *fi; + + list_for_each_entry(fi, &old->fi_list, nh_list) { + int err; + + err = fib_check_nexthop(new, fi->fib_scope, extack); + if (err) + return err; + } + return 0; +} + +static void nh_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, + struct nh_group *nhg, + struct nl_info *nlinfo) +{ + struct nexthop *nh = nhge->nh; + struct nh_grp_entry *nhges; + bool found = false; + int i; + + WARN_ON(!nh); + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; ++i) { + if (found) { + nhges[i-1].nh = nhges[i].nh; + nhges[i-1].weight = nhges[i].weight; + list_del(&nhges[i].nh_list); + list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); + } else if (nhg->nh_entries[i].nh == nh) { + found = true; + } + } + + if (WARN_ON(!found)) + return; + + nhg->num_nh--; + nhg->nh_entries[nhg->num_nh].nh = NULL; + + nh_group_rebalance(nhg); + + nexthop_put(nh); + + if 
(nlinfo) + nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nhg; + + list_del(&nhge->nh_list); + nhg = rtnl_dereference(nhge->nh_parent->nh_grp); + remove_nh_grp_entry(nhge, nhg, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhge->nh_parent, nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) +{ + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + int i, num_nh = nhg->num_nh; + + for (i = 0; i < num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (WARN_ON(!nhge->nh)) + continue; + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + nhg->num_nh--; + } +} + +/* not called for nexthop replace */ +static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i, *tmp; + bool do_flush = false; + struct fib_info *fi; + + list_for_each_entry(fi, &nh->fi_list, nh_list) { + fi->fib_flags |= RTNH_F_DEAD; + do_flush = true; + } + if (do_flush) + fib_flush(net); + + /* ip6_del_rt removes the entry from this list hence the _safe */ + list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + /* __ip6_del_rt does a release, so do a hold here */ + fib6_info_hold(f6i); + ipv6_stub->ip6_del_rt(net, f6i); + } +} + +static void __remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + __remove_nexthop_fib(net, nh); + + if (nh->is_group) { + remove_nexthop_group(nh, nlinfo); + } else { + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fib_nhc.nhc_dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, nlinfo); + } +} + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + /* remove from the tree */ + rb_erase(&nh->rb_node, &net->nexthop.rb_root); + + if (nlinfo) + nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); + + __remove_nexthop(net, nh, nlinfo); + nh_base_seq_inc(net); + + nexthop_put(nh); +} + +/* if any FIB entries reference this nexthop, any dst entries + * need to be regenerated + */ +static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) + rt_cache_flush(net); + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_update_sernum(net, f6i); +} + +static int replace_nexthop_grp(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_group *oldg, *newg; + int i; + + if (!new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); + return -EINVAL; + } + + oldg = rtnl_dereference(old->nh_grp); + newg = rtnl_dereference(new->nh_grp); + + /* update parents - used by nexthop code for cleanup */ + for (i = 0; i < newg->num_nh; i++) + newg->nh_entries[i].nh_parent = old; + + rcu_assign_pointer(old->nh_grp, newg); + + for (i = 0; i < oldg->num_nh; i++) + oldg->nh_entries[i].nh_parent = new; + + rcu_assign_pointer(new->nh_grp, oldg); + + return 0; +} + +static int replace_nexthop_single(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi, *newi; + + if (new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop 
group."); + return -EINVAL; + } + + oldi = rtnl_dereference(old->nh_info); + newi = rtnl_dereference(new->nh_info); + + newi->nh_parent = old; + oldi->nh_parent = new; + + old->protocol = new->protocol; + old->nh_flags = new->nh_flags; + + rcu_assign_pointer(old->nh_info, newi); + rcu_assign_pointer(new->nh_info, oldi); + + return 0; +} + +static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) { + struct fib_info *fi; + + /* expectation is a few fib_info per nexthop and then + * a lot of routes per fib_info. So mark the fib_info + * and then walk the fib tables once + */ + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = true; + + fib_info_notify_update(net, info); + + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = false; + } + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_rt_update(net, f6i, info); +} + +/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries + * linked to this nexthop and for all groups that the nexthop + * is a member of + */ +static void nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct nh_grp_entry *nhge; + + __nexthop_replace_notify(net, nh, info); + + list_for_each_entry(nhge, &nh->grp_list, nh_list) + __nexthop_replace_notify(net, nhge->nh_parent, info); +} + +static int replace_nexthop(struct net *net, struct nexthop *old, + struct nexthop *new, struct netlink_ext_ack *extack) +{ + bool new_is_reject = false; + struct nh_grp_entry *nhge; + int err; + + /* check that existing FIB entries are ok with the + * new nexthop definition + */ + err = fib_check_nh_list(old, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(old, new, extack); + if (err) + return err; + + if (!new->is_group) { + struct nh_info *nhi = rtnl_dereference(new->nh_info); + + new_is_reject = nhi->reject_nh; + } + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + /* if new nexthop is a blackhole, any groups using this + * nexthop cannot have more than 1 path + */ + if (new_is_reject && + nexthop_num_path(nhge->nh_parent) > 1) { + NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); + return -EINVAL; + } + + err = fib_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + } + + if (old->is_group) + err = replace_nexthop_grp(net, old, new, extack); + else + err = replace_nexthop_single(net, old, new, extack); + + if (!err) { + nh_rt_cache_flush(net, old); + + __remove_nexthop(net, new, NULL); + nexthop_put(new); + } + + return err; +} + +/* called with rtnl_lock held */ +static int insert_nexthop(struct net *net, struct nexthop *new_nh, + struct nh_config *cfg, struct netlink_ext_ack *extack) +{ + struct rb_node **pp, *parent = NULL, *next; + struct rb_root *root = &net->nexthop.rb_root; + bool replace = !!(cfg->nlflags & NLM_F_REPLACE); + bool create = !!(cfg->nlflags & NLM_F_CREATE); + u32 new_id = new_nh->id; + int replace_notify = 0; + int rc = -EEXIST; + + pp = &root->rb_node; + while (1) { + struct nexthop *nh; + + next = rtnl_dereference(*pp); + if (!next) + break; + + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (new_id < nh->id) { + pp = &next->rb_left; + } else if (new_id > nh->id) { + pp = &next->rb_right; + } else if (replace) { + rc = replace_nexthop(net, nh, new_nh, extack); + 
if (!rc) { + new_nh = nh; /* send notification with old nh */ + replace_notify = 1; + } + goto out; + } else { + /* id already exists and not a replace */ + goto out; + } + } + + if (replace && !create) { + NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); + rc = -ENOENT; + goto out; + } + + rb_link_node_rcu(&new_nh->rb_node, parent, pp); + rb_insert_color(&new_nh->rb_node, root); + rc = 0; +out: + if (!rc) { + nh_base_seq_inc(net); + nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); + if (replace_notify) + nexthop_replace_notify(net, new_nh, &cfg->nlinfo); + } + + return rc; +} + +/* rtnl */ +/* remove all nexthops tied to a device being deleted */ +static void nexthop_flush_dev(struct net_device *dev) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev != dev) + continue; + + remove_nexthop(net, nhi->nh_parent, NULL); + } +} + +/* rtnl; called when net namespace is deleted */ +static void flush_all_nexthops(struct net *net) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + struct nexthop *nh; + + while ((node = rb_first(root))) { + nh = rb_entry(node, struct nexthop, rb_node); + remove_nexthop(net, nh, NULL); + cond_resched(); + } +} + +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nhg; + struct nexthop *nh; + int i; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh->is_group = 1; + + nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nhg) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nhg->num_nh; ++i) { + struct nexthop *nhe; + struct nh_info *nhi; + + nhe = nexthop_find_by_id(net, entry[i].id); + if (!nexthop_get(nhe)) + goto out_no_nh; + + nhi = rtnl_dereference(nhe->nh_info); + if (nhi->family == AF_INET) + nhg->has_v4 = true; + + nhg->nh_entries[i].nh = nhe; + nhg->nh_entries[i].weight = entry[i].weight + 1; + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); + nhg->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->mpath = 1; + nh_group_rebalance(nhg); + } + + rcu_assign_pointer(nh->nh_grp, nhg); + + return nh; + +out_no_nh: + for (; i >= 0; --i) + nexthop_put(nhg->nh_entries[i].nh); + + kfree(nhg); + kfree(nh); + + return ERR_PTR(-ENOENT); +} + +static int nh_create_ipv4(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib_nh *fib_nh = &nhi->fib_nh; + struct fib_config fib_cfg = { + .fc_oif = cfg->nh_ifindex, + .fc_gw4 = cfg->gw.ipv4, + .fc_gw_family = cfg->gw.ipv4 ? 
AF_INET : 0, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + u32 tb_id = l3mdev_fib_table(cfg->dev); + int err = -EINVAL; + + err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); + if (err) { + fib_nh_release(net, fib_nh); + goto out; + } + + /* sets nh_dev if successful */ + err = fib_check_nh(net, fib_nh, tb_id, 0, extack); + if (!err) { + nh->nh_flags = fib_nh->fib_nh_flags; + fib_info_update_nhc_saddr(net, &fib_nh->nh_common, + fib_nh->fib_nh_scope); + } else { + fib_nh_release(net, fib_nh); + } +out: + return err; +} + +static int nh_create_ipv6(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib6_nh *fib6_nh = &nhi->fib6_nh; + struct fib6_config fib6_cfg = { + .fc_table = l3mdev_fib_table(cfg->dev), + .fc_ifindex = cfg->nh_ifindex, + .fc_gateway = cfg->gw.ipv6, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + int err; + + if (!ipv6_addr_any(&cfg->gw.ipv6)) + fib6_cfg.fc_flags |= RTF_GATEWAY; + + /* sets nh_dev if successful */ + err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, + extack); + if (err) + ipv6_stub->fib6_nh_release(fib6_nh); + else + nh->nh_flags = fib6_nh->fib_nh_flags; + + return err; +} + +static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + struct nexthop *nh; + int err = 0; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); + if (!nhi) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + nh->nh_flags = cfg->nh_flags; + nh->net = net; + + nhi->nh_parent = nh; + nhi->family = cfg->nh_family; + nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + + if (cfg->nh_blackhole) { + nhi->reject_nh = 1; + cfg->nh_ifindex = net->loopback_dev->ifindex; + } + + switch (cfg->nh_family) { + case AF_INET: + err = nh_create_ipv4(net, nh, nhi, cfg, extack); + break; + case AF_INET6: + err = nh_create_ipv6(net, nh, nhi, cfg, extack); + break; + } + + if (err) { + kfree(nhi); + kfree(nh); + return ERR_PTR(err); + } + + /* add the entry to the device based hash */ + nexthop_devhash_add(net, nhi); + + rcu_assign_pointer(nh->nh_info, nhi); + + return nh; +} + +/* called with rtnl lock held */ +static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nexthop *nh; + int err; + + if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + return ERR_PTR(-EINVAL); + } + + if (!cfg->nh_id) { + cfg->nh_id = nh_find_unused_id(net); + if (!cfg->nh_id) { + NL_SET_ERR_MSG(extack, "No unused id"); + return ERR_PTR(-EINVAL); + } + } + + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + + if (IS_ERR(nh)) + return nh; + + refcount_set(&nh->refcnt, 1); + nh->id = cfg->nh_id; + nh->protocol = cfg->nh_protocol; + nh->net = net; + + err = insert_nexthop(net, nh, cfg, extack); + if (err) { + __remove_nexthop(net, nh, NULL); + nexthop_put(nh); + nh = ERR_PTR(err); + } + + return nh; +} + +static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 
0) + return err; + + err = -EINVAL; + if (nhm->resvd || nhm->nh_scope) { + NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); + goto out; + } + if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); + goto out; + } + + switch (nhm->nh_family) { + case AF_INET: + case AF_INET6: + break; + case AF_UNSPEC: + if (tb[NHA_GROUP]) + break; + /* fallthrough */ + default: + NL_SET_ERR_MSG(extack, "Invalid address family"); + goto out; + } + + if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { + NL_SET_ERR_MSG(extack, "Invalid attributes in request"); + goto out; + } + + memset(cfg, 0, sizeof(*cfg)); + cfg->nlflags = nlh->nlmsg_flags; + cfg->nlinfo.portid = NETLINK_CB(skb).portid; + cfg->nlinfo.nlh = nlh; + cfg->nlinfo.nl_net = net; + + cfg->nh_family = nhm->nh_family; + cfg->nh_protocol = nhm->nh_protocol; + cfg->nh_flags = nhm->nh_flags; + + if (tb[NHA_ID]) + cfg->nh_id = nla_get_u32(tb[NHA_ID]); + + if (tb[NHA_GROUP]) { + if (nhm->nh_family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid family for group"); + goto out; + } + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + err = nh_check_attr_group(net, tb, extack); + + /* no other attributes should be set */ + goto out; + } + + if (tb[NHA_BLACKHOLE]) { + if (tb[NHA_GATEWAY] || tb[NHA_OIF] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + goto out; + } + + cfg->nh_blackhole = 1; + err = 0; + goto out; + } + + if (!tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + goto out; + } + + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } + + err = -EINVAL; + if (tb[NHA_GATEWAY]) { + struct nlattr *gwa = tb[NHA_GATEWAY]; + + switch (cfg->nh_family) { + case AF_INET: + if (nla_len(gwa) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv4 = nla_get_be32(gwa); + break; + case AF_INET6: + if (nla_len(gwa) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv6 = nla_get_in6_addr(gwa); + break; + default: + NL_SET_ERR_MSG(extack, + "Unknown address family for gateway"); + goto out; + } + } else { + /* device only nexthop (no gateway) */ + if (cfg->nh_flags & RTNH_F_ONLINK) { + NL_SET_ERR_MSG(extack, + "ONLINK flag can not be set for nexthop without a gateway"); + goto out; + } + } + + if (tb[NHA_ENCAP]) { + cfg->nh_encap = tb[NHA_ENCAP]; + + if (!tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); + goto out; + } + + cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + if (err < 0) + goto out; + + } else if (tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); + goto out; + } + + + err = 0; +out: + return 
err; +} + +/* rtnl */ +static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nh_config cfg; + struct nexthop *nh; + int err; + + err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); + if (!err) { + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + } + + return err; +} + +static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err, i; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + for (i = 0; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_ID: + break; + default: + NL_SET_ERR_MSG_ATTR(extack, tb[i], + "Unexpected attribute in request"); + goto out; + } + } + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header"); + goto out; + } + + if (!tb[NHA_ID]) { + NL_SET_ERR_MSG(extack, "Nexthop id is missing"); + goto out; + } + + *id = nla_get_u32(tb[NHA_ID]); + if (!(*id)) + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + else + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nl_info nlinfo = { + .nlh = nlh, + .nl_net = net, + .portid = NETLINK_CB(skb).portid, + }; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return -ENOENT; + + remove_nexthop(net, nh, &nlinfo); + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *skb = NULL; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + err = -ENOBUFS; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + err = -ENOENT; + nh = nexthop_find_by_id(net, id); + if (!nh) + goto errout_free; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + return err; +errout_free: + kfree_skb(skb); + goto out; +} + +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, + bool group_filter, u8 family) +{ + const struct net_device *dev; + const struct nh_info *nhi; + + if (group_filter && !nh->is_group) + return true; + + if (!dev_idx && !master_idx && !family) + return false; + + if (nh->is_group) + return true; + + nhi = rtnl_dereference(nh->nh_info); + if (family && nhi->family != family) + return true; + + dev = nhi->fib_nhc.nhc_dev; + if (dev_idx && (!dev || dev->ifindex != dev_idx)) + return true; + + if (master_idx) { + struct net_device *master; + + if (!dev) + return true; + + master = netdev_master_upper_dev_get((struct net_device *)dev); + if (!master || master->ifindex != master_idx) + return true; + } + + return false; +} + +static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, + int *master_idx, bool *group_filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr 
*tb[NHA_MAX + 1]; + struct nhmsg *nhm; + int err, i; + u32 idx; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + NULL); + if (err < 0) + return err; + + for (i = 0; i <= NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_OIF: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + *dev_idx = idx; + break; + case NHA_MASTER: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); + return -EINVAL; + } + *master_idx = idx; + break; + case NHA_GROUPS: + *group_filter = true; + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + nhm = nlmsg_data(nlh); + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); + return -EINVAL; + } + + return 0; +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + int dev_filter_idx = 0, master_idx = 0; + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + bool group_filter = false; + struct rb_node *node; + int idx = 0, s_idx; + int err; + + err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, + &group_filter, cb); + if (err < 0) + return err; + + s_idx = cb->args[0]; + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + if (idx < s_idx) + goto cont; + + nh = rb_entry(node, struct nexthop, rb_node); + if (nh_dump_filtered(nh, dev_filter_idx, master_idx, + group_filter, nhm->nh_family)) + goto cont; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } +cont: + idx++; + } + +out: + err = skb->len; +out_err: + cb->args[0] = idx; + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return err; +} + +static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev == dev) { + if (nhi->family == AF_INET) + fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, + orig_mtu); + } + } +} + +/* rtnl */ +static int nh_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_info_ext *info_ext; + + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGE: + if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGEMTU: + info_ext = ptr; + nexthop_sync_mtu(dev, info_ext->ext.mtu); + rt_cache_flush(dev_net(dev)); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nh_netdev_notifier = { + .notifier_call = nh_netdev_event, +}; + +static void __net_exit nexthop_net_exit(struct net *net) +{ + rtnl_lock(); + flush_all_nexthops(net); + rtnl_unlock(); + kfree(net->nexthop.devhash); +} + +static int __net_init nexthop_net_init(struct net *net) +{ + size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; + + net->nexthop.rb_root = RB_ROOT; + 
net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); + if (!net->nexthop.devhash) + return -ENOMEM; + + return 0; +} + +static struct pernet_operations nexthop_net_ops = { + .init = nexthop_net_init, + .exit = nexthop_net_exit, +}; + +static int __init nexthop_init(void) +{ + register_pernet_subsys(&nexthop_net_ops); + + register_netdevice_notifier(&nh_netdev_notifier); + + rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, + rtm_dump_nexthop, 0); + + rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + return 0; +} +subsys_initcall(nexthop_init); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 073273b751f8..cc90243ccf76 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -68,8 +68,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", - atomic_read(&net->ipv4.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv4.frags)); + atomic_read(&net->ipv4.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv4.fqdir)); return 0; } @@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), + SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ea0735a6754..a3e466b6a60c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -95,6 +95,7 @@ #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> @@ -1580,7 +1581,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID - { + if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); @@ -1962,6 +1963,23 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.basic.ip_proto = fl4->flowi4_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + /* skb is currently provided only when forwarding */ + if (skb) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, 0); + + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else { + /* Same as case 0 */ + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1979,7 +1997,7 @@ static int ip_mkroute_input(struct sk_buff *skb, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); @@ -2714,7 +2732,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - 
r->rtm_tos = fl4->flowi4_tos; + r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -2742,7 +2760,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (!rt_is_input_route(rt) && + if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; @@ -2782,36 +2800,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (fl4->flowi4_mark && - nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) - goto nla_put_failure; - - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; + if (fl4) { + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - error = rt->dst.error; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), + fl4->flowi4_uid))) + goto nla_put_failure; - if (rt_is_input_route(rt)) { + if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, - fl4->saddr, fl4->daddr, - r, portid); - - if (err <= 0) { - if (err == 0) - return 0; - goto nla_put_failure; - } - } else + if (ipv4_is_multicast(dst) && + !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, portid); + + if (err <= 0) { + if (err == 0) + return 0; + goto nla_put_failure; + } + } else #endif - if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) - goto nla_put_failure; + if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) + goto nla_put_failure; + } } + error = rt->dst.error; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; @@ -2823,6 +2845,80 @@ nla_put_failure: return -EMSGSIZE; } +static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, u32 table_id, + struct fnhe_hash_bucket *bucket, int genid, + int *fa_index, int fa_start) +{ + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference(bucket[i].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + struct rtable *rt; + int err; + + if (*fa_index < fa_start) + goto next; + + if (fnhe->fnhe_genid != genid) + goto next; + + if (fnhe->fnhe_expires && + time_after(jiffies, fnhe->fnhe_expires)) + goto next; + + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (!rt) + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (!rt) + goto next; + + err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, + table_id, NULL, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err) + return err; +next: + (*fa_index)++; + } + } + + return 0; +} + +int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, + u32 table_id, struct fib_info *fi, + int *fa_index, int fa_start) +{ + struct net *net = sock_net(cb->skb->sk); + int nhsel, genid = fnhe_genid(net); + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); + struct fnhe_hash_bucket *bucket; + int err; + + if (nhc->nhc_flags & RTNH_F_DEAD) + continue; + + 
rcu_read_lock(); + bucket = rcu_dereference(nhc->nhc_exceptions); + err = 0; + if (bucket) + err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, + genid, fa_index, fa_start); + rcu_read_unlock(); + if (err) + return err; + } + + return 0; +} + static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b6f14af926fa..7d66306b5f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -279,55 +279,96 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } +static int sscanf_key(char *buf, __le32 *key) +{ + u32 user_key[4]; + int i, ret = 0; + + if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, + user_key + 2, user_key + 3) != 4) { + ret = -EINVAL; + } else { + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + } + pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", + user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); + + return ret; +} + static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); - struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; - struct tcp_fastopen_context *ctxt; - u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ - __le32 key[4]; - int ret, i; + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. + */ + struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)) }; + struct tcp_fastopen_context *ctx; + u32 user_key[TCP_FASTOPEN_KEY_MAX * 4]; + __le32 key[TCP_FASTOPEN_KEY_MAX * 4]; + char *backup_data; + int ret, i = 0, off = 0, n_keys = 0; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); - if (ctxt) - memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); - else - memset(key, 0, sizeof(key)); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctx) { + n_keys = tcp_fastopen_context_len(ctx); + memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys); + } rcu_read_unlock(); - for (i = 0; i < ARRAY_SIZE(key); i++) + if (!n_keys) { + memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); + n_keys = 1; + } + + for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); - snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", - user_key[0], user_key[1], user_key[2], user_key[3]); + for (i = 0; i < n_keys; i++) { + off += snprintf(tbl.data + off, tbl.maxlen - off, + "%08x-%08x-%08x-%08x", + user_key[i * 4], + user_key[i * 4 + 1], + user_key[i * 4 + 2], + user_key[i * 4 + 3]); + if (i + 1 < n_keys) + off += snprintf(tbl.data + off, tbl.maxlen - off, ","); + } + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { - if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, - user_key + 2, user_key + 3) != 4) { + backup_data = strchr(tbl.data, ','); + if (backup_data) { + *backup_data = '\0'; + backup_data++; + } + if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } - - for (i = 0; i < ARRAY_SIZE(user_key); i++) - key[i] = cpu_to_le32(user_key[i]); - + if (backup_data) { + if (sscanf_key(backup_data, key + 4)) { + ret = -EINVAL; + goto bad_key; + } + } tcp_fastopen_reset_cipher(net, NULL, key, - 
TCP_FASTOPEN_KEY_LENGTH); + backup_data ? key + 4 : NULL); } bad_key: - pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], - (char *)tbl.data, ret); kfree(tbl.data); return ret; } @@ -956,7 +997,12 @@ static struct ctl_table ipv4_net_table[] = { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. + */ + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { @@ -984,7 +1030,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, - .extra2 = &one, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dc9ab84bb69..47c217905864 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2741,6 +2741,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -2791,15 +2806,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; + __u8 *backup_key = NULL; - if (optlen != sizeof(key)) + /* Allow a backup key as well to facilitate key rotation + * First key is the active one. 
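
A usage sketch for this two-key scheme (illustrative only; fd, new_key and old_key are placeholders, and the 16/32-byte sizes follow from TCP_FASTOPEN_KEY_LENGTH and TCP_FASTOPEN_KEY_BUF_LENGTH):

	__u8 keys[32];

	memcpy(keys, new_key, 16);		/* becomes the active key */
	memcpy(keys + 16, old_key, 16);	/* still validates outstanding cookies */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_KEY, keys, sizeof(keys));

Passing only a 16-byte buffer installs a primary key with no backup, preserving the old single-key behaviour.
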
+ */ + if (optlen != TCP_FASTOPEN_KEY_LENGTH && + optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_user(key, optval, optlen)) return -EFAULT; - return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) + backup_key = key + TCP_FASTOPEN_KEY_LENGTH; + + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ @@ -3083,6 +3106,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3453,21 +3481,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; struct tcp_fastopen_context *ctx; + unsigned int key_len = 0; if (get_user(len, optlen)) return -EFAULT; rcu_read_lock(); ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); - if (ctx) - memcpy(key, ctx->key, sizeof(key)); - else - len = 0; + if (ctx) { + key_len = tcp_fastopen_context_len(ctx) * + TCP_FASTOPEN_KEY_LENGTH; + memcpy(&key[0], &ctx->key[0], key_len); + } rcu_read_unlock(); - len = min_t(unsigned int, len, sizeof(key)); + len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, key, len)) @@ -3540,6 +3570,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f5a45e1e1182..3fd451271a70 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -30,15 +30,15 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. 
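
For completeness, a sketch of driving the matching sysctl from userspace (hypothetical key values; the comma-separated two-key format follows proc_tcp_fastopen_key() in the sysctl_net_ipv4.c hunk above, primary key first, error handling omitted):

	int fd = open("/proc/sys/net/ipv4/tcp_fastopen_key", O_WRONLY);
	const char *keys = "11111111-22222222-33333333-44444444,"
			   "55555555-66666666-77777777-88888888";

	if (fd >= 0) {
		write(fd, keys, strlen(keys));
		close(fd);
	}

Writing a single dash-separated key without the comma installs only a primary key, dropping any backup.
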
*/ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); - crypto_free_cipher(ctx->tfm); - kfree(ctx); + + kzfree(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) @@ -67,31 +67,27 @@ void tcp_fastopen_ctx_destroy(struct net *net) } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, - void *key, unsigned int len) + void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; - int err; + int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->tfm = crypto_alloc_cipher("aes", 0, 0); - - if (IS_ERR(ctx->tfm)) { - err = PTR_ERR(ctx->tfm); -error: kfree(ctx); - pr_err("TCP: TFO aes cipher alloc error: %d\n", err); - return err; + if (!ctx) { + err = -ENOMEM; + goto out; } - err = crypto_cipher_setkey(ctx->tfm, key, len); - if (err) { - pr_err("TCP: TFO cipher key error: %d\n", err); - crypto_free_cipher(ctx->tfm); - goto error; - } - memcpy(ctx->key, key, len); + ctx->key[0].key[0] = get_unaligned_le64(primary_key); + ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); + if (backup_key) { + ctx->key[1].key[0] = get_unaligned_le64(backup_key); + ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); + ctx->num = 2; + } else { + ctx->num = 1; + } spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { @@ -108,66 +104,58 @@ error: kfree(ctx); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); +out: return err; } -static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, + struct sk_buff *syn, + const siphash_key_t *key, + struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_context *ctx; - bool ok = false; - - rcu_read_lock(); - - ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); - if (!ctx) - ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); - if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); - foc->len = TCP_FASTOPEN_COOKIE_SIZE; - ok = true; - } - rcu_read_unlock(); - return ok; -} - -/* Generate the fastopen cookie by doing aes128 encryption on both - * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 - * addresses. For the longer IPv6 addresses use CBC-MAC. - * - * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
- */ -static bool tcp_fastopen_cookie_gen(struct sock *sk, - struct request_sock *req, - struct sk_buff *syn, - struct tcp_fastopen_cookie *foc) -{ if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); - __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(sk, path, foc); + foc->val[0] = cpu_to_le64(siphash(&iph->saddr, + sizeof(iph->saddr) + + sizeof(iph->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } - #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); - struct tcp_fastopen_cookie tmp; - - if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { - struct in6_addr *buf = &tmp.addr; - int i; - for (i = 0; i < 4; i++) - buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(sk, buf, foc); - } + foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, + sizeof(ip6h->saddr) + + sizeof(ip6h->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } #endif return false; } +/* Generate the fastopen cookie by applying SipHash to both the source and + * destination addresses. + */ +static void tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + struct tcp_fastopen_context *ctx; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (ctx) + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); + rcu_read_unlock(); +} /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. @@ -212,6 +200,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_fin(sk); } +/* returns 0 - no key match, 1 for primary, 2 for backup */ +static int tcp_fastopen_cookie_gen_check(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *orig, + struct tcp_fastopen_cookie *valid_foc) +{ + struct tcp_fastopen_cookie search_foc = { .len = -1 }; + struct tcp_fastopen_cookie *foc = valid_foc; + struct tcp_fastopen_context *ctx; + int i, ret = 0; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (!ctx) + goto out; + for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); + if (tcp_fastopen_cookie_match(foc, orig)) { + ret = i + 1; + goto out; + } + foc = &search_foc; + } +out: + rcu_read_unlock(); + return ret; +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -327,6 +344,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; + int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -342,31 +360,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; - if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && - foc->len == TCP_FASTOPEN_COOKIE_SIZE && - foc->len == valid_foc.len && - !memcmp(foc->val, valid_foc.val, foc->len)) { - /* Cookie is valid. Create a (full) child socket to accept - * the data in SYN before returning a SYN-ACK to ack the - * data. If we fail to create the socket, fall back and - * ack the ISN only but includes the same cookie. 
- * - * Note: Data-less SYN with valid cookie is allowed to send - * data in SYN_RECV state. - */ + if (foc->len == 0) { + /* Client requests a cookie. */ + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); + } else if (foc->len > 0) { + ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, + &valid_foc); + if (!ret) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + } else { + /* Cookie is valid. Create a (full) child socket to + * accept the data in SYN before returning a SYN-ACK to + * ack the data. If we fail to create the socket, fall + * back and ack the ISN only but includes the same + * cookie. + * + * Note: Data-less SYN with valid cookie is allowed to + * send data in SYN_RECV state. + */ fastopen: - child = tcp_fastopen_create_child(sk, skb, req); - if (child) { - foc->len = -1; + child = tcp_fastopen_create_child(sk, skb, req); + if (child) { + if (ret == 2) { + valid_foc.exp = foc->exp; + *foc = valid_foc; + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); + } else { + foc->len = -1; + } + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return child; + } NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVE); - return child; + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - + } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d95ee40df6c2..b71efeb0ae5b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { icsk->icsk_clean_acked = cad; - static_branch_inc(&clean_acked_data_enabled.key); + static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cfa81190a1b1..d57641cb3477 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - struct net *net; + u64 transmit_time = 0; struct sock *ctl_sk; + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -766,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -808,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; + u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -858,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c35731816e2..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); + tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0ebc33d1c9e5..4af1f5dae9d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -2239,6 +2241,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if (static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. 
* No need to wait for TX completion to call us back, @@ -3217,6 +3231,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3248,13 +3263,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3297,8 +3313,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eed59c847722..1b971bd95786 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) -/* IPCB reference means this can not be used from early demux */ -static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -364,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -420,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, + int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -432,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, @@ -460,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - bool exact_dif = udp_lib_exact_dif_match(net, skb); hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -468,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); if (!result) { hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -476,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); } - if 
(unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 9763464a75d7..a3908e55ed89 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -208,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, gso_skb->destructor = NULL; segs = skb_segment(gso_skb, features); - if (unlikely(IS_ERR_OR_NULL(segs))) { + if (IS_ERR_OR_NULL(segs)) { if (copy_dtor) gso_skb->destructor = sock_wfree; return segs; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 081bb517e40d..521e3203e83a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + /* prefix routes only use builtin fib6_nh */ + if (rt->nh) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr *ifa; - int flag = scope; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - + in_dev_for_each_ifa_rtnl(ifa, in_dev) { addr.s6_addr32[3] = ifa->ifa_local; if (ifa->ifa_scope == RT_SCOPE_LINK) @@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + /* host routes only use builtin fib6_nh */ + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? 
true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5352708b7b2d..7382a927d1eb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -208,7 +208,7 @@ lookup_protocol: np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect; + np->repflow = net->ipv6.sysctl.flowlabel_reflect & 1; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -922,6 +922,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375b4b4f9bf5..12906301ec7b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -75,9 +75,9 @@ * * On SMP we have one ICMP socket per-cpu. 
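
A side note on the accessor change below: this_cpu_read(*p) folds the per-cpu address calculation and the load into one preemption-safe access, whereas *this_cpu_ptr(p) first materializes the per-cpu pointer and then dereferences it. A minimal model of the two styles (demo_sk is a hypothetical variable, not from this patch):

	static DEFINE_PER_CPU(struct sock *, demo_sk);

	struct sock *a = *this_cpu_ptr(&demo_sk);	/* old style */
	struct sock *b = this_cpu_read(demo_sk);	/* new style */
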
*/ -static inline struct sock *icmpv6_sk(struct net *net) +static struct sock *icmpv6_sk(struct net *net) { - return *this_cpu_ptr(net->ipv6.icmp_sk); + return this_cpu_read(*net->ipv6.icmp_sk); } static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b2a55f300318..cf60fae9533b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, &in6addr_any, hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9180c8b6f764..49884f96232b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - + /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst;
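
The RT6_LOOKUP_F_DST_NOREF branch above enables reference-free lookups; a sketch of the intended caller pattern (illustrative; the flag assumes the caller holds rcu_read_lock() for the lifetime of the dst):

	rcu_read_lock();
	flags |= RT6_LOOKUP_F_DST_NOREF;
	dst = fib6_rule_lookup(net, &fl6, skb, flags, lookup);
	/* use dst; no dst_hold() was taken, so no dst_release() either */
	rcu_read_unlock();

@@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct 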
netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; @@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w) struct fib6_info *rt; for_each_fib6_walker_rt(w) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } + w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the @@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); @@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; @@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, + .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; @@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); - arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; } - /* fib entries are never clones */ - if (arg.filter.flags & RTM_F_CLONED) - goto out; - w = (void *)cb->args[2]; if (!w) { /* New dump: @@ -895,16 +897,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. 
The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +struct fib6_nh_pcpu_arg { + struct fib6_info *from; + const struct fib6_table *table; +}; + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_pcpu_arg *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg->from, arg->table); + return 0; +} + +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + struct fib6_nh_pcpu_arg arg = { + .from = f6i, + .table = table + }; + + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, + &arg); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); + + if (rt->nh && !list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1101,11 +1147,13 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); @@ -1130,11 +1178,13 @@ add: return -ENOENT; } - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_REPLACE, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); @@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. 
* <destination addr>/<source addr> @@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { + if (rt->nh) + list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, if (plen == fn->fn_bit) return fn; - prev = fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; next: /* @@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + fib6_info_release(rt); } @@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; @@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 21efcd02f337..8e49fd62eea9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? 
: ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct 
frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. 
*/ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -766,91 +891,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. 
*/ err = output(net, sk, frag); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09dd2edfb868..083cc1c94cd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - + /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); - if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..61819ed858b1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. 
+ */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, +#if IS_ENABLED(CONFIG_SYN_COOKIES) + .cookie_init_sequence = __cookie_v6_init_sequence, + .cookie_v6_check = __cookie_v6_check, +#endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + .br_defrag = nf_ct_frag6_gather, +#endif +#if IS_MODULE(CONFIG_IPV6) + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 41325d517478..e77ea1ed5edd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -3,272 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, 
NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. 
- */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, 
skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. 
- */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 84322ce81d70..398e1df41406 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -54,26 +54,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = 
&init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. 
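The hunks that follow convert nf_conntrack reassembly to a heap-allocated per-netns fqdir and split namespace teardown into two stages. As a reader aid, here is a minimal sketch of the resulting pernet shape; it assumes only the fqdir_init()/fqdir_pre_exit()/fqdir_exit() and register_pernet_subsys() APIs already visible in these hunks, and the example_* names are hypothetical, not part of the patch:

static struct inet_frags example_frags;

static int example_net_init(struct net *net)
{
	/* the fqdir is now allocated per netns, init_net included */
	int res = fqdir_init(&net->nf_frag.fqdir, &example_frags, net);

	if (res < 0)
		return res;

	/* tunables live in the fqdir, no longer directly in struct net */
	net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
	return 0;
}

static void example_net_pre_exit(struct net *net)
{
	/* stage one: stop accepting new fragment queues while
	 * pending timers drain
	 */
	fqdir_pre_exit(net->nf_frag.fqdir);
}

static void example_net_exit(struct net *net)
{
	/* stage two: free the rhashtable and any remaining queues */
	fqdir_exit(net->nf_frag.fqdir);
}

static struct pernet_operations example_net_ops = {
	.init     = example_net_init,
	.pre_exit = example_net_pre_exit,
	.exit     = example_net_exit,
};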
@@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } +static void nf_ct_net_pre_exit(struct net *net) +{ + fqdir_pre_exit(net->nf_frag.fqdir); +} + static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { - .init = nf_ct_net_init, - .exit = nf_ct_net_exit, + .init = nf_ct_net_init, + .pre_exit = nf_ct_net_pre_exit, + .exit = nf_ct_net_exit, }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..bbff3e02e302 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2b2c0c38b87..ca05b16f1bb9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -250,7 +248,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = 
"ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } +static void __net_exit ipv6_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv6.fqdir); +} + static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { - .init = ipv6_frags_init_net, - .exit = ipv6_frags_exit_net, + .init = ipv6_frags_init_net, + .pre_exit = ipv6_frags_pre_exit_net, + .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { @@ -587,8 +585,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a843cf164c..7556275b1cef 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); -static size_t rt6_nlmsg_size(struct fib6_info *rt); +static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if (!match->fib6_nsiblings || have_oif_match) + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. 
Otherwise now is the time to do it. */ - if (!fl6->mp_hash) + if (!fl6->mp_hash && + (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (unlikely(match->nh)) { + nexthop_path_fib6_result(res, fl6->mp_hash); + return; + } + + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, return false; } +struct fib6_nh_dm_arg { + struct net *net; + const struct in6_addr *saddr; + int oif; + int flags; + struct fib6_nh *nh; +}; + +static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_dm_arg *arg = _arg; + + arg->nh = nh; + return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, + arg->flags); +} + +/* returns fib6_nh from nexthop or NULL */ +static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, + struct fib6_result *res, + const struct in6_addr *saddr, + int oif, int flags) +{ + struct fib6_nh_dm_arg arg = { + .net = net, + .saddr = saddr, + .oif = oif, + .flags = flags, + }; + + if (nexthop_is_blackhole(nh)) + return NULL; + + if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) + return arg.nh; + + return NULL; +} + static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { @@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; - if (__rt6_device_match(net, nh, saddr, oif, flags)) { + bool matched = false; + + if (unlikely(spf6i->nh)) { + nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, + oif, flags); + if (nh) + matched = true; + } else { + nh = spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) + matched = true; + } + if (matched) { res->f6i = spf6i; goto out; } @@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } + if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; + return; + +out_blackhole: + res->fib6_flags |= RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -691,6 +766,24 @@ out: return rc; } 
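The route.c changes from here on repeat a single conversion idiom: a small argument struct plus a callback handed to nexthop_for_each_fib6_nh(), which walks every fib6_nh behind a shared nexthop object and stops as soon as the callback returns non-zero. A hedged sketch of that idiom follows, with hypothetical example_* names; only the iterator and the f6i->nh / f6i->fib6_nh fields are taken from the patch itself:

struct example_nh_arg {
	int oif;		/* in: ifindex we want to match */
	struct fib6_nh *nh;	/* out: matching nexthop, if any */
};

static int example_nh_match(struct fib6_nh *nh, void *_arg)
{
	struct example_nh_arg *arg = _arg;

	if (nh->fib_nh_dev && nh->fib_nh_dev->ifindex == arg->oif) {
		arg->nh = nh;
		return 1;	/* non-zero return stops the walk */
	}
	return 0;		/* keep iterating over fib6_nh entries */
}

static struct fib6_nh *example_find_nh(struct fib6_info *f6i, int oif)
{
	struct example_nh_arg arg = { .oif = oif };

	if (f6i->nh)	/* route uses a shared nexthop object */
		nexthop_for_each_fib6_nh(f6i->nh, example_nh_match, &arg);
	else		/* legacy route with its own inline fib6_nh */
		arg.nh = f6i->fib6_nh;

	return arg.nh;
}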
+struct fib6_nh_frl_arg { + u32 flags; + int oif; + int strict; + int *mpri; + bool *do_rr; + struct fib6_nh *nh; +}; + +static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_frl_arg *arg = _arg; + + arg->nh = nh; + return find_match(nh, arg->flags, arg->oif, arg->strict, + arg->mpri, arg->do_rr); +} + static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, @@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { + bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { @@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; - if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + if (unlikely(f6i->nh)) { + struct fib6_nh_frl_arg arg = { + .flags = f6i->fib6_flags, + .oif = oif, + .strict = strict, + .mpri = mpri, + .do_rr = do_rr + }; + + if (nexthop_is_blackhole(f6i->nh)) { + res->fib6_flags = RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->f6i = f6i; + res->nh = nexthop_fib6_nh(f6i->nh); + return; + } + if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, + &arg)) { + matched = true; + nh = arg.nh; + } + } else { + nh = f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, + mpri, do_rr)) + matched = true; + } + if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; @@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1114,6 +1234,8 @@ restart: rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; + } else if (res.fib6_flags & RTF_REJECT) { + goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, @@ -1125,6 +1247,7 @@ restart: if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { +do_create: rt = ip6_create_rt_rcu(&res); } @@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { - struct rt6_info *pcpu_rt, **p; - - p = this_cpu_ptr(res->f6i->rt6i_pcpu); - pcpu_rt = *p; + struct rt6_info *pcpu_rt; - if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt); + pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); return pcpu_rt; } @@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (i.e., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct
rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1539,7 +1707,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() from recreating the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() from recreating the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +static int rt6_nh_flush_exceptions(struct fib6_nh *nh,
void *arg) +{ + struct fib6_info *f6i = arg; + + fib6_nh_flush_exceptions(nh, f6i); + + return 0; +} + +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + if (f6i->nh) + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, + f6i); + else + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1612,25 +1801,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +struct fib6_nh_excptn_arg { + struct rt6_info *rt; + int plen; +}; + +static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_excptn_arg *arg = _arg; + int err; + + err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); + if (err == 0) + return 1; + + return 0; +} + +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + if (from->nh) { + struct fib6_nh_excptn_arg arg = { + .rt = rt, + .plen = from->fib6_src.plen + }; + int rc; + /* rc = 1 means an entry was found */ + rc = nexthop_for_each_fib6_nh(from->nh, + rt6_nh_remove_exception_rt, + &arg); + return rc ? 
0 : -ENOENT; + } + + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} +struct fib6_nh_match_arg { + const struct net_device *dev; + const struct in6_addr *gw; + struct fib6_nh *match; +}; + +/* determine if fib6_nh has given device and gateway */ +static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_match_arg *arg = _arg; + + if (arg->dev != nh->fib_nh_dev || + (arg->gw && !nh->fib_nh_gw_family) || + (!arg->gw && nh->fib_nh_gw_family) || + (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) + return 0; + + arg->match = nh; + + /* found a match, break the loop */ + return 1; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + struct fib6_nh *fib6_nh; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + + if (from->nh) { + struct fib6_nh_match_arg arg = { + .dev = rt->dst.dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); + + if (!arg.match) + return; + fib6_nh = arg.match; + } else { + fib6_nh = from->fib6_nh; + } + fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1824,23 +2089,21 @@ static void 
rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +struct fib6_nh_age_excptn_arg { + struct fib6_gc_args *gc_args; + unsigned long now; +}; + +static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_age_excptn_arg *arg = _arg; + + fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); + return 0; +} + +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + if (f6i->nh) { + struct fib6_nh_age_excptn_arg arg = { + .gc_args = gc_args, + .now = now + }; + + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, + &arg); + } else { + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); + } +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - - rcu_read_unlock(); + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). 
+ * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2090,11 +2376,12 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2116,8 +2403,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2129,8 +2416,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2138,6 +2426,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2145,6 +2434,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2157,6 +2447,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. 
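RT6_LOOKUP_F_DST_NOREF shifts the lifetime rule from refcounting to RCU: the returned dst is only guaranteed valid inside the caller's RCU read-side section, except for uncached-list entries, which still carry the reference noted in the comment above. A sketch of the calling convention for the new noref output path, with error handling reduced to the dst->error check for brevity:

	struct dst_entry *dst;

	rcu_read_lock();
	dst = ip6_route_output_flags_noref(net, sk, &fl6, 0);
	if (!dst->error) {
		/* Use dst only inside this RCU section. It must not
		 * escape unless the caller takes dst_hold_safe()
		 * itself, which is what the ip6_route_output_flags()
		 * wrapper below does on behalf of legacy callers.
		 */
	}
	rcu_read_unlock();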
*/ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2381,10 +2693,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt6->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev + gw. Should be impossible. + */ + if (!arg.match) { + rcu_read_unlock(); + return; + } + + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2491,6 +2824,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res, return true; } +struct fib6_nh_rd_arg { + struct fib6_result *res; + struct flowi6 *fl6; + const struct in6_addr *gw; + struct rt6_info **ret; +}; + +static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_rd_arg *arg = _arg; + + arg->res->nh = nh; + return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2506,6 +2854,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; + struct fib6_nh_rd_arg arg = { + .res = &res, + .fl6 = fl6, + .gw = &rdfl->gateway, + .ret = &ret + }; struct fib6_info *rt; struct fib6_node *fn; @@ -2530,14 +2884,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; - if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) - goto out; + if (unlikely(rt->nh)) { + if (nexthop_is_blackhole(rt->nh)) + continue; + /* on match, res->nh is filled in and potentially ret */ + if (nexthop_for_each_fib6_nh(rt->nh, + fib6_nh_redirect_match, + &arg)) + goto out; + } else { + res.nh = rt->fib6_nh; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, + &ret)) + goto out; + } } if (!rt) @@ -2554,7 +2918,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -2781,10 +3145,9 @@ out: return entries > rt_max_size; } -static struct rt6_info *ip6_nh_lookup_table(struct net *net, - struct fib6_config *cfg, - const struct in6_addr *gw_addr, - u32 tbid, int flags) +static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, + const struct in6_addr *gw_addr, u32 tbid, + int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2792,25 +3155,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; - struct rt6_info *rt; + int err; table = fib6_get_table(net, tbid); if (!table) - return NULL; + return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); - /* if 
table lookup failed, fall back to full lookup */ - if (rt == net->ipv6.ip6_null_entry) { - ip6_rt_put(rt); - rt = NULL; - } + err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) + fib6_select_path(net, res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); - return rt; + return err; } static int ip6_route_check_nh_onlink(struct net *net, @@ -2818,29 +3179,19 @@ static int ip6_route_check_nh_onlink(struct net *net, const struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; - u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; - struct fib6_info *from; - struct rt6_info *grt; + struct fib6_result res = {}; int err; - err = 0; - grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); - if (grt) { - rcu_read_lock(); - from = rcu_dereference(grt->from); - if (!grt->dst.error && - /* ignore match if it is the default route */ - from && !ipv6_addr_any(&from->fib6_dst.addr) && - (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); - err = -EINVAL; - } - rcu_read_unlock(); - - ip6_rt_put(grt); + err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); + if (!err && !(res.fib6_flags & RTF_REJECT) && + /* ignore match if it is the default route */ + !ipv6_addr_any(&res.f6i->fib6_dst.addr) && + (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); + err = -EINVAL; } return err; @@ -2853,47 +3204,50 @@ static int ip6_route_check_nh(struct net *net, { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; - struct rt6_info *grt = NULL; + int flags = RT6_LOOKUP_F_IFACE; + struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { - int flags = RT6_LOOKUP_F_IFACE; - - grt = ip6_nh_lookup_table(net, cfg, gw_addr, - cfg->fc_table, flags); - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } + err = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags, &res); + /* gw_addr can not require a gateway or resolve to a reject + * route. If a device is given, it must match the result. 
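After this rework, ip6_nh_lookup_table() reports through struct fib6_result instead of handing back a refcounted rt6_info, so callers validate the candidate nexthop in place and never need ip6_rt_put(). A condensed sketch of the contract, using only fields that appear in these hunks:

	struct fib6_result res = {};
	int err;

	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
	if (err)
		return err;	/* e.g. -EINVAL when the table is absent */

	/* res.f6i/res.nh are RCU-protected; no reference was taken */
	if ((res.fib6_flags & RTF_REJECT) ||
	    res.nh->fib_nh_gw_family ||		/* must not need a gateway */
	    (dev && dev != res.nh->fib_nh_dev))	/* device must match */
		return -EHOSTUNREACH;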
+ */ + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family || + (dev && dev != res.nh->fib_nh_dev)) + err = -EHOSTUNREACH; } - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); + if (err < 0) { + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + }; + + err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family) + err = -EHOSTUNREACH; - if (!grt) - goto out; + if (err) + return err; + + fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); + } + err = 0; if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (dev != res.nh->fib_nh_dev) + err = -EHOSTUNREACH; } else { - *_dev = dev = grt->dst.dev; - *idev = grt->rt6i_idev; + *_dev = dev = res.nh->fib_nh_dev; dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + *idev = in6_dev_get(dev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - - ip6_rt_put(grt); - -out: return err; } @@ -2934,11 +3288,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, goto out; } + rcu_read_lock(); + if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, idev); + rcu_read_unlock(); + if (err) goto out; } @@ -3039,7 +3397,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; } } - goto set_dev; + goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { @@ -3075,7 +3433,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; -set_dev: + +pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; @@ -3095,6 +3460,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3104,7 +3501,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; + struct nexthop *nh = NULL; struct fib6_table *table; + struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; @@ -3140,6 +3539,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif + if (cfg->fc_nh_id) { + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out; + } + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out; + } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3157,7 +3566,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; @@ -3197,19 +3606,35 @@ 
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); - if (err) - goto out; + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + goto out; + } + if (rt->fib6_src.plen) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto out; + } + rt->nh = nh; + fib6_nh = nexthop_fib6_nh(rt->nh); + } else { + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes - */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + fib6_nh = rt->fib6_nh; + + /* We cannot add true routes via loopback here, they would + * result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, + addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -3301,6 +3726,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + info->skip_notify_kernel = 1; + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, + rt->fib6_nsiblings, + NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -3323,7 +3754,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3339,10 +3770,49 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + +struct fib6_nh_del_cached_rt_arg { + struct fib6_config *cfg; + struct fib6_info *f6i; +}; + +static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_del_cached_rt_arg *arg = _arg; + int rc; + + rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); + return rc != -ESRCH ? 
rc : 0; +} + +static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) +{ + struct fib6_nh_del_cached_rt_arg arg = { + .cfg = cfg, + .f6i = f6i + }; + + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3365,26 +3835,45 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + if (rt->nh && cfg->fc_nh_id && + rt->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; - int rc; - - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + int rc = 0; + + if (rt->nh) { + rc = ip6_del_cached_rt_nh(cfg, rt); + } else if (cfg->fc_nh_id) { + continue; + } else { + nh = rt->fib6_nh; + rc = ip6_del_cached_rt(cfg, rt, nh); + } + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) + continue; + if (cfg->fc_protocol && + cfg->fc_protocol != rt->fib6_protocol) + continue; + + if (rt->nh) { + if (!fib6_info_hold_safe(rt)) + continue; + rcu_read_unlock(); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + if (cfg->fc_nh_id) + continue; + + nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3392,10 +3881,6 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) - continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) - continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); @@ -3506,7 +3991,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev. 
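The -ESRCH translation in fib6_nh_del_cached_rt() above is what keeps the walk going: nexthop_for_each_fib6_nh() stops on any non-zero return, and a cache miss on one fib6_nh must not prevent the remaining nexthops from being scanned. Only a real error from __ip6_del_cached_rt() should abort the iteration. In sketch form:

	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
	/* -ESRCH means "no cached clone under this fib6_nh": keep
	 * walking. 0 (deleted, or nothing cached at all) also keeps
	 * walking so every nexthop gets scanned; any other value
	 * stops nexthop_for_each_fib6_nh() and is returned as-is.
	 */
	return rc != -ESRCH ? rc : 0;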
Should be impossible + */ + if (!arg.match) + goto out; + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3558,12 +4061,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + /* these routes do not use nexthops */ + if (rt->nh) + continue; + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3621,8 +4127,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh; + /* RA routes do not use nexthops */ + if (rt->nh) + continue; + + nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) @@ -3873,7 +4384,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (!rt->nh && + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3901,18 +4413,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh; + + /* RA routes do not use nexthops */ + if (rt->nh) + return 0; + nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. 
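Another recurring idiom in this patch: paths that only ever operate on routes with their own fib6_nh (RA and route-information routes, prefsrc removal, the multipath dead/weight accounting) bail out as soon as they see a shared nexthop object. The guard always has the same shape; a sketch:

	/* RA routes and the legacy multipath code never hang off a
	 * nexthop object; skip fib6_info entries that use one.
	 */
	if (rt->nh)
		return 0;	/* or continue; inside a fib walk */

	nh = rt->fib6_nh;	/* now a pointer, not an embedded struct */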
*/ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3950,11 +4466,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) return NULL; } +/* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3966,11 +4483,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3981,11 +4498,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4028,9 +4545,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && !rt->nh && + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4053,15 +4570,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +/* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4082,12 +4600,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4099,11 +4617,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if 
(iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4113,17 +4631,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.fib6_null_entry) + if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4139,10 +4657,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4176,9 +4694,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4193,24 +4738,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. 
jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); + arg->f6i = f6i; + if (f6i->nh) { + /* fib6_nh_mtu_change only returns 0, so this is safe */ + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, + arg); } - return 0; + + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4224,6 +4762,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4241,6 +4780,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4287,6 +4827,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + if (tb[RTA_NH_ID]) { + if (tb[RTA_GATEWAY] || tb[RTA_OIF] || + tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + goto errout; + } + cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); + } + if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4430,6 +4980,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -4489,7 +5040,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4501,12 +5052,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } + if (list_empty(&rt6_nh_list)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; + /* For add and replace, send one notification with all nexthops. For + * append, send one notification with all appended nexthops. + */ + info->skip_notify_kernel = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); @@ -4543,6 +5105,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; + err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, + rt_notif, nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + /* success ... 
tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; @@ -4621,6 +5192,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_nh_id && + !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } + if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { @@ -4648,17 +5225,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct fib6_info *rt) +/* add the overhead of this fib6_nh to nexthop_len */ +static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) +{ + int *nexthop_len = arg; + + *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16); /* RTA_GATEWAY */ + + if (nh->fib_nh_lws) { + /* RTA_ENCAP_TYPE */ + *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); + /* RTA_ENCAP */ + *nexthop_len += nla_total_size(2); + } + + return 0; +} + +static size_t rt6_nlmsg_size(struct fib6_info *f6i) { - int nexthop_len = 0; + int nexthop_len; + + if (f6i->nh) { + nexthop_len = nla_total_size(4); /* RTA_NH_ID */ + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, + &nexthop_len); + } else { + struct fib6_nh *nh = f6i->fib6_nh; - if (rt->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + nexthop_len = 0; + if (f6i->fib6_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + lwtunnel_get_encap_size(nh->fib_nh_lws); - nexthop_len *= rt->fib6_nsiblings; + nexthop_len *= f6i->fib6_nsiblings; + } + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4674,10 +5280,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, + unsigned char *flags) +{ + if (nexthop_is_multipath(nh)) { + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (nexthop_mpath_fill_node(skb, nh)) + goto nla_put_failure; + + nla_nest_end(skb, mp); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = nexthop_fib6_nh(nh); + if (fib_nexthop_info(skb, &fib6_nh->nh_common, + flags, false) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4687,6 +5321,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt6 = (struct rt6_info *)dst; struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; + unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; @@ -4794,22 +5429,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, 
&rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); - } else { - unsigned char nh_flags = 0; + } else if (rt->nh) { + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) + goto nla_put_failure; + + if (nexthop_is_blackhole(rt->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + goto nla_put_failure; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + rtm->rtm_flags |= nh_flags; + } else { + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4836,10 +5480,28 @@ nla_put_failure: return -EMSGSIZE; } +static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + if (nh->fib_nh_dev == dev) + return 1; + + return 0; +} + static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->nh) { + struct net_device *_dev = (struct net_device *)dev; + + return !!nexthop_for_each_fib6_nh(f6i->nh, + fib6_info_nh_uses_dev, + _dev); + } + + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4847,7 +5509,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -4855,33 +5517,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, return false; } -int rt6_dump_route(struct fib6_info *rt, void *p_arg) +struct fib6_nh_exception_dump_walker { + struct rt6_rtnl_dump_arg *dump; + struct fib6_info *rt; + unsigned int flags; + unsigned int skip; + unsigned int count; +}; + +static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_nh_exception_dump_walker *w = arg; + struct rt6_rtnl_dump_arg *dump = w->dump; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i, err; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); + if (!bucket) + return 0; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (w->skip) { + w->skip--; + continue; + } + + /* Expiration of entries doesn't bump sernum, insertion + * does. Removal is triggered by insertion, so we can + * rely on the fact that if entries change between two + * partial dumps, this node is scanned again completely, + * see rt6_insert_exception() and fib6_dump_table(). + * + * Count expired entries we go through as handled + * entries that we'll skip next time, in case of partial + * node dump. Otherwise, if entries expire meanwhile, + * we'll skip the wrong amount. 
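The skip/count pair is what makes the exception dump resumable: the caller restarts a partial netlink dump by passing back how many entries were already handled, so expired entries that are silently dropped still have to be counted, or the resumed walk would land in the wrong place. Schematically, for each bucket entry:

	if (w->skip) {			/* already emitted last time */
		w->skip--;
		continue;
	}
	if (rt6_check_expired(rt6_ex->rt6i)) {
		w->count++;		/* counted although not emitted */
		continue;
	}
	/* emit one RTM_NEWROUTE for rt6_ex->rt6i via rt6_fill_node() */
	w->count++;			/* resumption point on overflow */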
+ */ + if (rt6_check_expired(rt6_ex->rt6i)) { + w->count++; + continue; + } + + err = rt6_fill_node(dump->net, dump->skb, w->rt, + &rt6_ex->rt6i->dst, NULL, NULL, 0, + RTM_NEWROUTE, + NETLINK_CB(dump->cb->skb).portid, + dump->cb->nlh->nlmsg_seq, w->flags); + if (err) + return err; + + w->count++; + } + bucket++; + } + + return 0; +} + +/* Return -1 if done with node, number of handled routes on partial dump */ +int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; + int count = 0; if (rt == net->ipv6.fib6_null_entry) - return 0; + return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ - return 1; + return -1; } - if (filter->filter_set) { - if ((filter->rt_type && rt->fib6_type != filter->rt_type) || - (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || - (filter->protocol && rt->fib6_protocol != filter->protocol)) { - return 1; - } + if (filter->filter_set && + ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol))) { + return -1; + } + + if (filter->filter_set || + !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } - return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, - RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, flags); + if (filter->dump_routes) { + if (skip) { + skip--; + } else { + if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, + 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, flags)) { + return 0; + } + count++; + } + } + + if (filter->dump_exceptions) { + struct fib6_nh_exception_dump_walker w = { .dump = arg, + .rt = rt, + .flags = flags, + .skip = skip, + .count = 0 }; + int err; + + rcu_read_lock(); + if (rt->nh) { + err = nexthop_for_each_fib6_nh(rt->nh, + rt6_nh_dump_exceptions, + &w); + } else { + err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); + } + rcu_read_unlock(); + + if (err) + return count += w.count; + } + + return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, @@ -5126,6 +5886,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5136,7 +5928,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5330,11 +6122,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5344,6 +6136,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -5355,6 +6148,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -5364,6 +6158,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; @@ -5471,7 +6266,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 
e15cd37024fd..6d86fac472e7 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ static int zero; static int one = 1; +static int three = 3; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -114,6 +115,8 @@ static struct ctl_table ipv6_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &three, }, { .procname = "max_dst_opts_number", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a14ea37d2df..408d9ec26971 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - if (sk) - mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) { + mark = inet_twsk(sk)->tw_mark; + /* autoflowlabel relies on buff->hash */ + skb_set_hash(buff, inet_twsk(sk)->tw_txhash, + PKT_HASH_TYPE_L4); + } else { + mark = sk->sk_mark; + } + buff->tstamp = tcp_transmit_time(sk); + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif + __be32 label = 0; + struct net *net; int oif = 0; if (th->rst) @@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); @@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
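The sysctl hunk above bounds net.ipv6.flowlabel_reflect to the range 0..3, i.e. a two-bit mask. On my reading of this series (treat the bit meanings as an assumption), bit 0 keeps the existing behaviour of reflecting incoming flow labels on established sockets, and the new bit 1 enables reflection in TCP RSTs sent without a socket, which is exactly what the "& 2" test added below checks:

	/* net.ipv6.flowlabel_reflect, clamped to [0, 3]:
	 *   1: reflect flow labels on established sockets
	 *   2: reflect the incoming flow label in RSTs for
	 *      unknown/closed sockets (added by this patch)
	 */
	if (net->ipv6.sysctl.flowlabel_reflect & 2)
		label = ip6_flowlabel(ipv6h);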
*/ - sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = inet6_lookup_listener(net, &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, @@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) trace_tcp_send_reset(sk, skb); + if (sk->sk_state == TCP_TIME_WAIT) + label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + } else { + if (net->ipv6.sysctl.flowlabel_reflect & 2) + label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, + label); #ifdef CONFIG_TCP_MD5SIG out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70b01bd95022..66ca5a4b17c4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,16 +54,6 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, - struct udp_hslot *hslot2, struct sk_buff *skb) + int dif, int sdif, struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness; @@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, @@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result; - bool exact_dif = udp6_lib_exact_dif_match(net, skb); hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, + daddr, hnum, dif, sdif, hslot2, skb); if (!result) { hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); @@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, - exact_dif, hslot2, - skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/key/af_key.c b/net/key/af_key.c index a50dd6f34b91..39b3d95094eb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -928,8 +928,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct 
sadb_address) + sockaddr_size); @@ -944,8 +943,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 6e2b4b9267e1..35bb4f3bdbe0 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -31,7 +31,6 @@ #include "l2tp_core.h" static struct dentry *rootdir; -static struct dentry *tunnels; struct l2tp_dfs_seq_data { struct net *net; @@ -326,32 +325,18 @@ static const struct file_operations l2tp_dfs_fops = { static int __init l2tp_debugfs_init(void) { - int rc = 0; - rootdir = debugfs_create_dir("l2tp", NULL); - if (IS_ERR(rootdir)) { - rc = PTR_ERR(rootdir); - rootdir = NULL; - goto out; - } - tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); - if (tunnels == NULL) - rc = -EIO; + debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); pr_info("L2TP debugfs support\n"); -out: - if (rc) - pr_warn("unable to init\n"); - - return rc; + return 0; } static void __exit l2tp_debugfs_exit(void) { - debugfs_remove(tunnels); - debugfs_remove(rootdir); + debugfs_remove_recursive(rootdir); } module_init(l2tp_debugfs_init); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index cfc9fcb97465..f35899d45a9a 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup + * This function does not hold refcnt on the returned dst. + * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, @@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct dst_entry *dst = NULL; struct net_device *dev; + WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); @@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); - - rcu_read_unlock(); } return dst; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 5d2d1f746b91..3c03f6512c5f 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -68,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb) lapb_put(lapb); } } -EXPORT_SYMBOL(lapb_register); /* * Add a socket to the bound sockets list. 
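The l2tp (and later mac80211) debugfs changes apply the tree-wide rule that debugfs return values need no checking: an error pointer from debugfs_create_dir() can be fed straight into the next debugfs call, and teardown collapses to a single debugfs_remove_recursive() on the root. A minimal sketch of the resulting pattern, with foo and foo_fops as hypothetical names:

static struct dentry *rootdir;

static int __init foo_debugfs_init(void)
{
	rootdir = debugfs_create_dir("foo", NULL);
	/* no IS_ERR()/NULL check: debugfs accepts an error value
	 * as the parent and turns the next call into a no-op
	 */
	debugfs_create_file("stats", 0600, rootdir, NULL, &foo_fops);
	return 0;
}

static void __exit foo_debugfs_exit(void)
{
	debugfs_remove_recursive(rootdir);	/* children included */
}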
@@ -115,7 +114,6 @@ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); - if (!lapb) goto out; @@ -167,6 +165,7 @@ out: write_unlock_bh(&lapb_list_lock); return rc; } +EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a1973a26c7fc..4f12d042c89c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,6 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018-2019 Intel Corporation * Copyright (C) 2018 Intel Corporation */ @@ -974,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_BEACON | BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_TWT; int err; int prev_beacon_int; @@ -1044,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.dtim_period = params->dtim_period; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; + sdata->vif.bss_conf.twt_responder = params->twt_responder; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1465,7 +1468,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, return ret; } - if (params->supported_rates) { + if (params->supported_rates && params->supported_rates_len) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, sband, params->supported_rates, params->supported_rates_len, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 271bc2b676a4..2e7f75938c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -272,6 +272,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(EXT_KEY_ID_NATIVE), + FLAG(NO_AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 3509ce0daea3..7b8735ced2a1 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -339,9 +339,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); - if (!key->debugfs.dir) - return; - sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f1f2e1c7ac0c..b1438fd4d876 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -815,9 +815,8 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) sprintf(buf, "netdev:%s", sdata->name); sdata->vif.debugfs_dir = debugfs_create_dir(buf, sdata->local->hw.wiphy->debugfsdir); - if (sdata->vif.debugfs_dir) - sdata->debugfs.subdir_stations = debugfs_create_dir("stations", - sdata->vif.debugfs_dir); + sdata->debugfs.subdir_stations = debugfs_create_dir("stations", + sdata->vif.debugfs_dir); add_files(sdata); } @@ -842,8 +841,5 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) return; sprintf(buf, "netdev:%s", sdata->name); - if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf)) - sdata_err(sdata, - "debugfs: failed to rename debugfs dir to %s\n", - buf); + debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3fd79ccb293b..c8ad20c28c43 100644 --- 
a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -957,8 +957,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) * dir might still be around. */ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); - if (!sta->debugfs_dir) - return; DEBUGFS_ADD(flags); DEBUGFS_ADD(aid); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 157ff5f890d2..dd60f6428049 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -269,50 +269,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) assert_key_lock(local); sta->ptk_idx = key->conf.keyidx; + + if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT)) + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } -static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, - struct ieee80211_key *new_key, - bool pairwise) +static void ieee80211_pairwise_rekey(struct ieee80211_key *old, + struct ieee80211_key *new) { - struct ieee80211_sub_if_data *sdata; - struct ieee80211_local *local; - struct sta_info *sta; - int ret; - - /* Aggregation sessions are OK when running on SW crypto. - * A broken remote STA may cause issues not observed with HW - * crypto, though. - */ - if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) - return 0; + struct ieee80211_local *local = new->local; + struct sta_info *sta = new->sta; + int i; - assert_key_lock(old_key->local); - sta = old_key->sta; + assert_key_lock(local); - /* Unicast rekey without Extended Key ID needs special handling */ - if (new_key && sta && pairwise && - rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) { - local = old_key->local; - sdata = old_key->sdata; + if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { + /* Extended Key ID key install, initial one or rekey */ + + if (sta->ptk_idx != INVALID_PTK_KEYIDX && + ieee80211_hw_check(&local->hw, + NO_AMPDU_KEYBORDER_SUPPORT)) { + /* Aggregation Sessions with Extended Key ID must not + * mix MPDUs with different keyIDs within one A-MPDU. + * Tear down any running Tx aggregation and all new + * Rx/Tx aggregation request during rekey if the driver + * asks us to do so. (Blocking Tx only would be + * sufficient but WLAN_STA_BLOCK_BA gets the job done + * for the few ms we need it.) + */ + set_sta_flag(sta, WLAN_STA_BLOCK_BA); + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + ___ieee80211_stop_tx_ba_session(sta, i, + AGG_STOP_LOCAL_REQUEST); + mutex_unlock(&sta->ampdu_mlme.mtx); + } + } else if (old) { + /* Rekey without Extended Key ID. + * Aggregation sessions are OK when running on SW crypto. + * A broken remote STA may cause issues not observed with HW + * crypto, though. + */ + if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + return; - /* Stop TX till we are on the new key */ - old_key->flags |= KEY_FLAG_TAINTED; + /* Stop Tx till we are on the new key */ + old->flags |= KEY_FLAG_TAINTED; ieee80211_clear_fast_xmit(sta); - - /* Aggregation sessions during rekey are complicated due to the - * reorder buffer and retransmits. Side step that by blocking - * aggregation during rekey and tear down running sessions. 
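NO_AMPDU_KEYBORDER_SUPPORT is the driver-side opt-in for the behaviour implemented above: a driver that cannot enforce a clean key border inside an A-MPDU asks mac80211 to block and tear down aggregation sessions around an Extended Key ID rekey. A sketch of how a driver would advertise it, with the helper name hypothetical:

static void foo_set_hw_flags(struct ieee80211_hw *hw)
{
	ieee80211_hw_set(hw, AMPDU_AGGREGATION);
	/* MPDUs with different key IDs must not share an A-MPDU:
	 * let mac80211 stop Tx BA sessions and block new Rx/Tx
	 * aggregation while a PTK rekey is in flight (see
	 * ieee80211_pairwise_rekey() above).
	 */
	ieee80211_hw_set(hw, NO_AMPDU_KEYBORDER_SUPPORT);
}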
- */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_LOCAL_REQUEST); } - if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", @@ -320,18 +331,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, /* Flushing the driver queues *may* help prevent * the clear text leaks and freezes. */ - ieee80211_flush_queues(local, sdata, false); + ieee80211_flush_queues(local, old->sdata, false); } } - - ieee80211_key_disable_hw_accel(old_key); - - if (new_key) - ret = ieee80211_key_enable_hw_accel(new_key); - else - ret = 0; - - return ret; } static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, @@ -389,7 +391,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, mutex_unlock(&sdata->local->key_mtx); } - static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, bool pairwise, @@ -397,7 +398,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *new) { int idx; - int ret; + int ret = 0; bool defunikey, defmultikey, defmgmtkey; /* caller must provide at least one old/new */ @@ -409,16 +410,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); + if (new && sta && pairwise) { + /* Unicast rekey needs special handling. With Extended Key ID + * old is still NULL for the first rekey. + */ + ieee80211_pairwise_rekey(old, new); + } + if (old) { idx = old->conf.keyidx; - ret = ieee80211_hw_key_replace(old, new, pairwise); + + if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + ieee80211_key_disable_hw_accel(old); + + if (new) + ret = ieee80211_key_enable_hw_accel(new); + } } else { /* new must be provided in case old is not */ idx = new->conf.keyidx; if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); - else - ret = 0; } if (ret) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 55583b71ffaf..85e416248753 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -351,11 +351,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata_lock(sdata); /* Copy the addresses to the bss_conf list */ - ifa = idev->ifa_list; + ifa = rtnl_dereference(idev->ifa_list); while (ifa) { if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN) bss_conf->arp_addr_list[c] = ifa->ifa_address; - ifa = ifa->ifa_next; + ifa = rtnl_dereference(ifa->ifa_next); c++; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 379d2ab6d327..96014f459a2b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,6 +3155,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta, IEEE80211_HE_MAC_CAP0_TWT_RES; } +static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee802_11_elems *elems) +{ + bool twt = ieee80211_twt_req_supported(sta, elems); + + if (sdata->vif.bss_conf.twt_requester != twt) { + sdata->vif.bss_conf.twt_requester = twt; + return BSS_CHANGED_TWT; + } + return 0; +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -3337,8 +3350,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - bss_conf->twt_requester = - 
ieee80211_twt_req_supported(sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3998,6 +4010,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, @@ -4948,7 +4962,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, basic_rates = BIT(min_rate_index); } - new_sta->sta.supp_rates[cbss->channel->band] = rates; + if (rates) + new_sta->sta.supp_rates[cbss->channel->band] = rates; + else + sdata_info(sdata, + "No rates found, keeping mandatory only\n"); + sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 6e5961d7f639..60ef8972b254 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -199,6 +199,10 @@ static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); + else + cfg80211_tx_mgmt_expired(&roc->sdata->wdev, + roc->mgmt_tx_cookie, + roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 47ee36677c2b..a1e9fc7878aa 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -354,8 +354,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, break; } WARN_ONCE(i == sband->n_bitrates, - "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n", + "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", + sta ? sta->addr : NULL, sta ? sta->supp_rates[sband->band] : -1, + sband->band, rate_mask, rate_flags); info->control.rates[0].count = @@ -366,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, } -bool rate_control_send_low(struct ieee80211_sta *pubsta, - void *priv_sta, - struct ieee80211_tx_rate_control *txrc) +static bool rate_control_send_low(struct ieee80211_sta *pubsta, + struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; @@ -376,7 +377,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) { + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -402,7 +403,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, } return false; } -EXPORT_SYMBOL(rate_control_send_low); static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { @@ -885,26 +885,29 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; - if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { - ista = &sta->sta; - priv_sta = sta->rate_ctrl_priv; - } - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } + if (rate_control_send_low(sta ? 
&sta->sta : NULL, txrc)) + return; + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; + if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { + ista = &sta->sta; + priv_sta = sta->rate_ctrl_priv; + } + if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { - ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index a34e9c2ca626..ee86c3333999 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, int delta; int sampling_ratio; - /* management/no-ack frames do not use rate control */ - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - /* check multi-rate-retry capabilities & adjust lookaround_rate */ mrr_capable = mp->has_mrr && !txrc->rts && diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 298a1acb3ce5..5a882da82f0e 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1093,9 +1093,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct minstrel_priv *mp = priv; int sample_idx; - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - if (!msp->is_ht) return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 187f62a48b2b..95eb8220e2e4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation */ #include <linux/module.h> @@ -401,6 +401,47 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); + for (i = 0; i < NUM_NL80211_BANDS; i++) { + u32 mandatory = 0; + int r; + + if (!hw->wiphy->bands[i]) + continue; + + switch (i) { + case NL80211_BAND_2GHZ: + /* + * We use both here, even if we cannot really know for + * sure the station will support both, but the only use + * for this is when we don't know anything yet and send + * management frames, and then we'll pick the lowest + * possible rate anyway. 
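The rate.c refactor above centralizes the lowest-rate fallback: rate_control_send_low() becomes static and is called once from rate_control_get_rate(), so minstrel and minstrel_ht can drop their own copies of the management/no-ack check. A condensed sketch of the resulting dispatch, using the names from the hunk (locking and rate-array initialization elided):

    if (rate_control_send_low(sta ? &sta->sta : NULL, txrc))
            return;         /* no-data/no-ack frame or no sta: lowest rate set */

    if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL))
            return;         /* the device selects rates on its own */

    if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
            ref->ops->get_rate(ref->priv, &sta->sta,
                               sta->rate_ctrl_priv, txrc);
    else
            rate_control_send_low(NULL, txrc);  /* sta lacks rc state */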
+ * If we don't include _G here, we cannot find a rate + * in P2P, and thus trigger the WARN_ONCE() in rate.c + */ + mandatory = IEEE80211_RATE_MANDATORY_B | + IEEE80211_RATE_MANDATORY_G; + break; + case NL80211_BAND_5GHZ: + mandatory = IEEE80211_RATE_MANDATORY_A; + break; + case NL80211_BAND_60GHZ: + WARN_ON(1); + mandatory = 0; + break; + } + + for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { + struct ieee80211_rate *rate; + + rate = &hw->wiphy->bands[i]->bitrates[r]; + + if (!(rate->flags & mandatory)) + continue; + sta->sta.supp_rates[i] |= BIT(r); + } + } + sta->sta.smps_mode = IEEE80211_SMPS_OFF; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b96fd3f54705..817a9e5d16e4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); - -int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) -{ - if (writable_len > skb->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (!skb_cloned(skb)) { - if (writable_len <= skb_headlen(skb)) - return 1; - } else if (skb_clone_writable(skb, writable_len)) - return 1; - - if (writable_len <= skb_headlen(skb)) - writable_len = 0; - else - writable_len -= skb_headlen(skb); - - return !!__pskb_pull_tail(skb, writable_len); -} -EXPORT_SYMBOL(skb_make_writable); - /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8acc4e173167..063df74b4647 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e3884b0cca91..11ff9d4a7006 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:ip type */ @@ -28,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index b73c37b3a791..ca7ac4a25ada 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -2,7 +2,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ @@ -28,7 +27,7 @@ 
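All the skb_make_writable() to skb_ensure_writable() conversions in this patch hinge on the inverted return convention visible in the deletion above: the old helper returned non-zero on success, while the replacement follows the usual kernel convention of 0 on success and a negative errno on failure. Callers therefore flip the test rather than the logic; schematically:

    /* before: non-zero meant "header is now writable" */
    if (!skb_make_writable(skb, tcphoff + sizeof(*tcph)))
            return 0;

    /* after: 0 on success, negative errno on failure */
    if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
            return 0;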
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index d8c140553379..704a0dda1609 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:port type */ @@ -23,7 +22,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3cdf171cd468..2e151856ad99 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ @@ -48,7 +48,7 @@ static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); @@ -1290,11 +1290,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; + int ret; - /* Second pass, so parser can't fail */ - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, ip_set_setname_policy, - NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_setname_policy, NULL); + if (ret) + return ret; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { @@ -1541,10 +1543,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, + ip_set_adt_policy, NULL); + if (ret) { + nlmsg_free(skb2); + return ret; + } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; @@ -1558,10 +1564,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_ad(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + enum ipset_adt adt, + 
const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1590,18 +1598,17 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1610,56 +1617,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, return ret; } -static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int ip_set_uadd(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { - struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; - const struct nlattr *nla; - u32 flags = flag_exist(nlh); - bool use_lineno; - int ret = 0; - - if (unlikely(protocol_min_failed(attr) || - !attr[IPSET_ATTR_SETNAME] || - !((attr[IPSET_ATTR_DATA] != NULL) ^ - (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] && - !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] && - (!flag_nested(attr[IPSET_ATTR_ADT]) || - !attr[IPSET_ATTR_LINENO])))) - return -IPSET_ERR_PROTOCOL; - - set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (!set) - return -ENOENT; - - use_lineno = !!attr[IPSET_ATTR_LINENO]; - if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, - use_lineno); - } else { - int nla_rem; + return ip_set_ad(net, ctnl, skb, + IPSET_ADD, nlh, attr, extack); +} - nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(*tb)); - if (nla_type(nla) != IPSET_ATTR_DATA || - !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, - flags, use_lineno); - if (ret < 0) - return ret; - } - } - return ret; +static int ip_set_udel(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + return ip_set_ad(net, ctnl, skb, + IPSET_DEL, nlh, attr, extack); } static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2384e36aef5c..2b8f959574b4 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2011 Jozsef Kadlecsik 
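Folding ip_set_uadd() and ip_set_udel() into one ip_set_ad() worker is more than deduplication: the two open-coded copies had drifted apart (note the deleted udel path clearing its attribute table with memset(tb, 0, sizeof(*tb)), which zeroes only the first slot, where uadd used sizeof(tb)). After the hunk above, each verb is a thin wrapper that differs only in the enum it passes:

    static int ip_set_uadd(struct net *net, struct sock *ctnl,
                           struct sk_buff *skb, const struct nlmsghdr *nlh,
                           const struct nlattr * const attr[],
                           struct netlink_ext_ack *extack)
    {
            /* ip_set_udel() is identical apart from passing IPSET_DEL */
            return ip_set_ad(net, ctnl, skb, IPSET_ADD, nlh, attr, extack);
    }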
<kadlec@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ /* Get Layer-4 data from the packets */ diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 10f619625abd..0feb77fa9edc 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef _IP_SET_HASH_GEN_H #define _IP_SET_HASH_GEN_H @@ -622,7 +621,7 @@ retry: goto cleanup; } m->size = AHASH_INIT_SIZE; - extsize = ext_size(AHASH_INIT_SIZE, dsize); + extsize += ext_size(AHASH_INIT_SIZE, dsize); RCU_INIT_POINTER(hbucket(t, key), m); } else if (m->pos >= m->size) { struct hbucket *ht; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 69d7576be2e6..f4432d9fcad0 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip type */ @@ -27,7 +26,7 @@ #define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 6fe1ec0d2154..7a1734aad0c5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * Copyright (C) 2013 Smoothwall Ltd. 
<vytas.dauksa@smoothwall.net> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 74ec7e097e34..32e240658334 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ced57d63b01f..15d419353179 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 905f6cf0f55e..7a4d7afd4121 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ @@ -31,7 +30,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 853e772ab4d9..d94c585d33c5 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ @@ -20,7 +19,7 @@ #define IPSET_TYPE_REV_MAX 0 MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); diff --git 
a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 06c91e49bf25..c259cbc3ef45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net type */ @@ -28,7 +27,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 0a8cbcdfb42b..87b29f971226 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,iface type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 832e4f5491cb..a3ae69bfee66 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> */ diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index a4f3f15b874a..799f2272cc65 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,port type */ @@ -30,7 +29,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index e54d415405f3..a82b70e8b9a6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ diff 
--git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8ada318bf09d..6f9ead6319e0 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the list:set type */ @@ -19,7 +18,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index bfd4365a8d73..4515056ef1c2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -358,7 +358,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); @@ -435,7 +435,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7138556b206b..e8651fd621ef 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -34,6 +34,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ +#include <net/gue.h> #include <net/route.h> #include <net/ip6_checksum.h> #include <net/netns/generic.h> /* net_generic() */ @@ -892,7 +893,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 @@ -1282,7 +1283,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); - if (!skb_make_writable(skb, iph->len)) + if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ @@ -1574,6 +1575,41 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 1; } +/* Check the UDP tunnel and return its header length */ +static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct udphdr _udph, *udph; + struct ip_vs_dest *dest; + + udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!udph) + goto unk; + offset += sizeof(struct udphdr); + dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + struct guehdr _gueh, *gueh; + + gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); + if (!gueh) + goto unk; + if (gueh->control != 0 || gueh->version != 0) + goto unk; + /* Later we can support also IPPROTO_IPV6 */ + if (gueh->proto_ctype != IPPROTO_IPIP) 
+ goto unk; + *proto = gueh->proto_ctype; + return sizeof(struct udphdr) + sizeof(struct guehdr) + + (gueh->hlen << 2); + } + +unk: + return 0; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1593,6 +1629,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool ipip, new_cp = false; + union nf_inet_addr *raddr; *related = 1; @@ -1631,20 +1668,51 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { + struct ip_vs_dest *dest; + if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; + dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); + /* Only for known tunnel */ + if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) + return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ ipip = true; + } else if (cih->protocol == IPPROTO_UDP && /* Can be UDP encap */ + /* Error for our tunnel must arrive at LOCAL_IN */ + (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { + __u8 iproto; + int ulen; + + /* Non-first fragment has no UDP header */ + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + offset2 = offset + cih->ihl * 4; + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, + &iproto); + if (ulen > 0) { + /* Skip IP and UDP tunnel headers */ + offset = offset2 + ulen; + /* Now we should be at the original IP header */ + cih = skb_header_pointer(skb, offset, sizeof(_ciph), + &_ciph); + if (cih && cih->version == 4 && cih->ihl >= 5 && + iproto == IPPROTO_IPIP) + ipip = true; + else + return NF_ACCEPT; + } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 776c87ed4813..84384d896e29 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -510,15 +510,36 @@ static inline unsigned int ip_vs_rs_hashkey(int af, static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; + __be16 port; if (dest->in_rs_table) return; + switch (IP_VS_DFWD_METHOD(dest)) { + case IP_VS_CONN_F_MASQ: + port = dest->port; + break; + case IP_VS_CONN_F_TUNNEL: + switch (dest->tun_type) { + case IP_VS_CONN_F_TUNNEL_TYPE_GUE: + port = dest->tun_port; + break; + case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + port = 0; + break; + default: + return; + } + break; + default: + return; + } + /* * Hash by proto,addr,port, * which are the parameters of the real service. 
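ipvs_udp_decap() above is what lets the ICMP error path see through GUE encapsulation: match the outer UDP destination port against a configured tunnel real server, then validate the GUE header before trusting its advertised length. Condensed, with the checks annotated (names from the hunk):

    udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
    if (!udph)
            goto unk;                       /* truncated outer UDP header */

    dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
    if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_GUE)
            goto unk;                       /* not one of our GUE tunnels */

    gueh = skb_header_pointer(skb, offset + sizeof(*udph),
                              sizeof(_gueh), &_gueh);
    if (!gueh || gueh->control != 0 || gueh->version != 0)
            goto unk;                       /* only GUE v0 data messages */
    if (gueh->proto_ctype != IPPROTO_IPIP)
            goto unk;                       /* only IPIP payload for now */

    /* tunnel overhead: UDP + fixed GUE header + options (hlen is in
     * 32-bit words)
     */
    return sizeof(struct udphdr) + sizeof(struct guehdr) + (gueh->hlen << 2);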
*/ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; @@ -550,7 +571,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } @@ -580,7 +602,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* Find real service record by <af,addr,tun_port>. + * In case of multiple records with the same <af,addr,tun_port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port) +{ + struct ip_vs_dest *dest; + unsigned int hash; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, tun_port); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->tun_port == tun_port && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } @@ -826,24 +878,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* Need to rehash? */ + if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != + IP_VS_DFWD_METHOD(dest) || + udest->tun_type != dest->tun_type || + udest->tun_port != dest->tun_port) + ip_vs_rs_unhash(dest); + /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { - /* - * Put the real service in rs_table if not present. - * For now only for NAT! - */ - ip_vs_rs_hash(ipvs, dest); /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); + /* Put the real service in rs_table if not present. 
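In short, rs_table entries are now keyed by whatever port identifies the destination for its forwarding method, which is why the formerly NAT-only lookups above grow an explicit IP_VS_DFWD_METHOD() guard and tunnels get their own ip_vs_find_tunnel(). The hashing decision reduces roughly to this (unknown tunnel types simply return in the original):

    switch (IP_VS_DFWD_METHOD(dest)) {
    case IP_VS_CONN_F_MASQ:
            port = dest->port;              /* NAT: real service port */
            break;
    case IP_VS_CONN_F_TUNNEL:
            /* GUE tunnels hash by UDP tunnel port, IPIP by port 0 */
            if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
                    port = dest->tun_port;
            else
                    port = 0;
            break;
    default:
            return;                         /* other methods: not hashed */
    }
    hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);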
*/ + ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); @@ -2906,6 +2963,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3212,6 +3270,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3332,7 +3392,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh, *nla_tun_type, *nla_tun_port; + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; @@ -3340,6 +3401,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3355,6 +3417,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index c244b2545e24..cf925906f59b 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -267,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { @@ -433,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. 
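IPVS_DEST_ATTR_TUN_FLAGS above follows the standard recipe for extending a genetlink interface: declare the attribute in the policy, emit it when dumping, and parse it as optional so older userspace keeps working. The three touch points, condensed:

    /* 1. policy: validate as a 16-bit attribute */
    [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },

    /* 2. dump: report the destination's tunnel flags */
    if (nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, dest->tun_flags))
            goto nla_put_failure;

    /* 3. parse: attribute is optional, for backward compatibility */
    if (attrs[IPVS_DEST_ATTR_TUN_FLAGS])
            udest->tun_flags = nla_get_u16(attrs[IPVS_DEST_ATTR_TUN_FLAGS]);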
*/ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index b58ddb7dffd1..a0921adc31a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { @@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 915ac8206076..000d961b97e4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -159,7 +159,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { @@ -237,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 379140075e95..153d89647c87 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -148,7 +148,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { @@ -231,7 +231,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index e101eda05d55..71fc6d63a67f 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -36,6 +36,7 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> @@ -275,7 +276,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return false; ipv6_hdr(skb)->hop_limit--; @@ -290,7 +291,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; /* 
Decrease ttl */ @@ -381,8 +382,13 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -536,8 +542,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -792,7 +803,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -881,7 +892,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1002,17 +1013,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); struct udphdr *udph; /* Our new UDP header */ struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } - skb_push(skb, sizeof(struct guehdr)); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); gueh = (struct guehdr *)skb->data; gueh->control = 0; gueh->version = 0; - gueh->hlen = 0; + gueh->hlen = optlen >> 2; gueh->flags = 0; gueh->proto_ctype = *next_protocol; + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); @@ -1066,6 +1116,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1088,9 +1139,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; - 
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; @@ -1101,8 +1162,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1111,8 +1181,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1170,6 +1251,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1193,9 +1275,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, @@ -1204,8 +1296,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1214,8 +1315,18 @@ 
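The remote-checksum-offload branch added to ipvs_gue_encap() stores two 16-bit values in a GUE private option so the receiver can complete the inner checksum: where checksumming starts and where the result must be written, both relative to the start of the encapsulated packet. The subtle part is rebasing the skb's CHECKSUM_PARTIAL offsets after the GUE header has been pushed (names from the hunk):

    u16 csum_start = skb_checksum_start_offset(skb); /* from skb->data */

    /* skb->data now points at the GUE header we just pushed, so strip
     * the GUE header + options to make the offset inner-packet relative
     */
    if (csum_start < hdrlen)
            return -EINVAL;
    csum_start -= hdrlen;

    pd[0] = htons(csum_start);                    /* checksum start   */
    pd[1] = htons(csum_start + skb->csum_offset); /* where to store it */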
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -1400,7 +1511,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1489,7 +1600,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index e52fcb9c9a96..921a7b95be68 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -37,12 +37,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (ifa->ifa_broadcast == iph->daddr) { mask = ifa->ifa_mask; break; } - } endfor_ifa(in_dev); + } } if (mask == 0) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f4f9b8344a32..bdfeacee0817 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -749,9 +749,6 @@ begin: continue; } - if (nf_ct_is_dying(ct)) - continue; - if (nf_ct_key_equal(h, tuple, zone, net)) return h; } @@ -777,20 +774,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct; rcu_read_lock(); -begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { + /* We have a candidate that matches the tuple we're interested + * in, try to obtain a reference and re-check tuple + */ ct = nf_ct_tuplehash_to_ctrack(h); - if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) - h = NULL; - else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { - nf_ct_put(ct); - goto begin; - } + if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(nf_ct_key_equal(h, tuple, zone, net))) + goto found; + + /* TYPESAFE_BY_RCU recycled the candidate */ + nf_ct_put(ct); } + + h = NULL; } +found: rcu_read_unlock(); return h; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index fac6986d37a8..6497e5fc0871 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> * * For more information, please see http://nath323.sourceforge.net/ */ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 
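The __nf_conntrack_find_get() rework above is the canonical lookup idiom for a SLAB_TYPESAFE_BY_RCU cache: freed conntrack memory may be recycled for a new entry without waiting for an RCU grace period, so the lockless walk stops filtering dying entries and correctness comes from taking a reference first and re-checking the tuple afterwards. A sketch with rcu_read_lock()/unlock() elided:

    h = ____nf_conntrack_find(net, zone, tuple, hash);
    if (h) {
            ct = nf_ct_tuplehash_to_ctrack(h);
            /* refuse entries whose refcount already hit zero */
            if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
                    /* memory may have been recycled for a different
                     * entry between the find and the refcount; verify
                     * the tuple again now that the object is pinned
                     */
                    if (likely(nf_ct_key_equal(h, tuple, zone, net)))
                            return h;       /* hit, reference held */
                    nf_ct_put(ct);          /* recycled entry: miss */
            }
            h = NULL;
    }
    return h;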
37bb530d848f..a0560d175a7f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_bridge.h> #include <net/netfilter/nf_log.h> #include <linux/ip.h> @@ -120,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -static unsigned int nf_confirm(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; @@ -154,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb, /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } +EXPORT_SYMBOL_GPL(nf_confirm); static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, @@ -442,12 +442,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) return 0; } +static struct nf_ct_bridge_info *nf_ct_bridge_info; + static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); - bool fixup_needed = false; + bool fixup_needed = false, retry = true; int err = 0; - +retry: mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { @@ -487,6 +489,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) fixup_needed = true; break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) { + if (!retry) { + err = -EPROTO; + goto out_unlock; + } + mutex_unlock(&nf_ct_proto_mutex); + request_module("nf_conntrack_bridge"); + retry = false; + goto retry; + } + if (!try_module_get(nf_ct_bridge_info->me)) { + err = -EPROTO; + goto out_unlock; + } + cnet->users_bridge++; + if (cnet->users_bridge > 1) + goto out_unlock; + + err = nf_register_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + if (err) + cnet->users_bridge = 0; + else + fixup_needed = true; + break; default: err = -EPROTO; break; @@ -519,47 +547,99 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) ARRAY_SIZE(ipv6_conntrack_ops)); break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) + break; + if (cnet->users_bridge && (--cnet->users_bridge == 0)) + nf_unregister_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + + module_put(nf_ct_bridge_info->me); + break; } - mutex_unlock(&nf_ct_proto_mutex); } -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_inet_get(struct net *net) { int err; - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + return err; err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: return err; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + switch (nfproto) { + case NFPROTO_INET: + err = nf_ct_netns_inet_get(net); + break; + case NFPROTO_BRIDGE: + err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE); + if (err < 0) + return err; + + err = nf_ct_netns_inet_get(net); + if (err < 0) { + nf_ct_netns_put(net, 
NFPROTO_BRIDGE); + return err; + } + break; + default: + err = nf_ct_netns_do_get(net, nfproto); + break; + } + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); void nf_ct_netns_put(struct net *net, uint8_t nfproto) { - if (nfproto == NFPROTO_INET) { + switch (nfproto) { + case NFPROTO_BRIDGE: + nf_ct_netns_do_put(net, NFPROTO_BRIDGE); + /* fall through */ + case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else { + break; + default: nf_ct_netns_do_put(net, nfproto); + break; } } EXPORT_SYMBOL_GPL(nf_ct_netns_put); +void nf_ct_bridge_register(struct nf_ct_bridge_info *info) +{ + WARN_ON(nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = info; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_register); + +void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info) +{ + WARN_ON(!nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = NULL; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister); + int nf_conntrack_proto_init(void) { int ret; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 522c08c23600..fce3d93f1541 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -336,7 +336,7 @@ static bool sctp_error(struct sk_buff *skb, if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1e2cc83ff5da..d5fdfa00d683 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dc21a43cd145..3066449f8bd8 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; @@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 948b4ebbe3fb..e3d797252a98 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->dst_port = ctt->dst.u.tcp.port; ft->iifidx = other_dst->dev->ifindex; - ft->oifidx = dst->dev->ifindex; ft->dst_cache = dst; } diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 98bf543e9891..a263505455fc 
100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -95,7 +95,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct tcphdr *tcph; int oldlen, datalen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && @@ -145,7 +145,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, struct udphdr *udph; int datalen, oldlen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 07da07788f6b..888292e8fbb2 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -70,7 +70,7 @@ static bool udp_manip_pkt(struct sk_buff *skb, struct udphdr *hdr; bool do_csum; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -88,7 +88,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -114,7 +114,7 @@ sctp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -155,7 +155,7 @@ tcp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); @@ -195,7 +195,7 @@ dccp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); @@ -229,7 +229,7 @@ icmp_manip_pkt(struct sk_buff *skb, { struct icmphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); @@ -247,7 +247,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, { struct icmp6hdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); @@ -275,7 +275,7 @@ gre_manip_pkt(struct sk_buff *skb, /* pgreh includes two optional 32bit fields which are not required * to be there. 
That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; @@ -347,7 +347,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, struct iphdr *iph; unsigned int hdroff; - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; @@ -378,7 +378,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, int hdroff; u8 nexthdr; - if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; @@ -562,7 +562,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) return 0; @@ -784,7 +784,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 4ffe5e5e65ba..f91579c821e9 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -44,15 +44,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, if (hooknum == NF_INET_LOCAL_OUT) { newdst = htonl(0x7F000001); } else { - struct in_device *indev; - struct in_ifaddr *ifa; + const struct in_device *indev; newdst = 0; indev = __in_dev_get_rcu(skb->dev); - if (indev && indev->ifa_list) { - ifa = indev->ifa_list; - newdst = ifa->ifa_local; + if (indev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(indev->ifa_list); + if (ifa) + newdst = ifa->ifa_local; } if (!newdst) diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 7de28fa0f14a..e338d91980d8 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -282,7 +282,7 @@ next: if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { struct udphdr *uh; - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8ce74ed985c0..409722d23302 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -10,16 +10,16 @@ #include <net/netns/generic.h> #include <linux/proc_fs.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_tcpudp.h> -#include <linux/netfilter/xt_SYNPROXY.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nf_SYNPROXY.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_synproxy.h> unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -57,7 +57,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, case TCPOPT_MSS: if 
(opsize == TCPOLEN_MSS) { opts->mss = get_unaligned_be16(ptr); - opts->options |= XT_SYNPROXY_OPT_MSS; + opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: @@ -65,19 +65,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); - opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) - opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } @@ -89,36 +89,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, } EXPORT_SYMBOL_GPL(synproxy_parse_options); -unsigned int synproxy_options_size(const struct synproxy_options *opts) +static unsigned int +synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; - if (opts->options & XT_SYNPROXY_OPT_MSS) + if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; - else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) + if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } -EXPORT_SYMBOL_GPL(synproxy_options_size); -void +static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; - if (options & XT_SYNPROXY_OPT_MSS) + if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); - if (options & XT_SYNPROXY_OPT_TIMESTAMP) { - if (options & XT_SYNPROXY_OPT_SACK_PERM) + if (options & NF_SYNPROXY_OPT_TIMESTAMP) { + if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -131,58 +131,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); - } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (options & XT_SYNPROXY_OPT_WSCALE) + if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } -EXPORT_SYMBOL_GPL(synproxy_build_options); -void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_time_stamp_raw() & ~0x3f; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; - if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; - if (opts->options & XT_SYNPROXY_OPT_ECN) + if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } 
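
The two functions above implement the timestamp cookie. synproxy_init_timestamp_cookie() truncates tcp_time_stamp_raw() to a 64 ms grain (& ~0x3f), freeing the low six bits of the TCP timestamp to carry the negotiated options through the stateless SYN-cookie phase: bits 0-3 hold the window scale (0xf means "no wscale option", which is safe because TCP_MAX_WSCALE is 14), bit 4 is SACK-permitted, bit 5 is ECN. The peer echoes tsval back in tsecr, from which synproxy_check_timestamp_cookie() recovers the options. A minimal stand-alone C sketch of that bit layout (ts_encode/ts_decode are hypothetical names, not kernel functions):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define TS_WSCALE_MASK	0x0fu	/* bits 0-3: wscale, 0xf = option absent */
#define TS_SACK_PERM	(1u << 4)
#define TS_ECN		(1u << 5)

static uint32_t ts_encode(uint32_t now, bool wscale_ok, uint8_t wscale,
			  bool sack_perm, bool ecn)
{
	uint32_t ts = now & ~0x3fu;	/* keep only the 64 ms clock grain */

	ts |= wscale_ok ? (wscale & TS_WSCALE_MASK) : TS_WSCALE_MASK;
	if (sack_perm)
		ts |= TS_SACK_PERM;
	if (ecn)
		ts |= TS_ECN;
	return ts;
}

static void ts_decode(uint32_t tsecr, bool *wscale_ok, uint8_t *wscale,
		      bool *sack_perm, bool *ecn)
{
	*wscale = tsecr & TS_WSCALE_MASK;
	*wscale_ok = *wscale != TS_WSCALE_MASK;
	*sack_perm = tsecr & TS_SACK_PERM;
	*ecn = tsecr & TS_ECN;
}

int main(void)
{
	bool ws_ok, sack, ecn;
	uint8_t ws;

	ts_decode(ts_encode(0x12345678, true, 7, true, false),
		  &ws_ok, &ws, &sack, &ecn);
	assert(ws_ok && ws == 7 && sack && !ecn);
	return 0;
}

The 64 ms granularity costs little timestamp precision, and because the options ride in the echoed tsecr, no per-connection state is needed until the client's final ACK validates the cookie.
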
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); -void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +static void +synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; - opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; - opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; + opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0; } -EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); -unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *th, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_conn_synproxy *synproxy) +static unsigned int +synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, + struct tcphdr *th, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; @@ -193,7 +191,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { @@ -232,7 +230,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, } return 1; } -EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { .len = sizeof(struct nf_conn_synproxy), @@ -413,5 +410,830 @@ static void __exit synproxy_core_exit(void) module_init(synproxy_core_init); module_exit(synproxy_core_exit); +static struct iphdr * +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + 
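/* nskb was sized as MAX_TCP_HEADER of headroom plus exactly the IPv4 and TCP headers appended below via skb_put(); the reserved headroom lets the output path prepend link-layer or tunnel headers without reallocating. */ +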
niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack); + +static void +synproxy_send_server_syn(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(struct net *net, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + 
GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} + +bool +synproxy_recv_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); + +unsigned int +ipv4_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. 
+ */ + if (synproxy_recv_client_ack(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); + +static const struct nf_hook_ops ipv4_synproxy_ops[] = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref4++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); + +void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); + +#if IS_ENABLED(CONFIG_IPV6) +static struct ipv6hdr * +synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp_ipv6(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct dst_entry *dst; + struct flowi6 fl6; + int err; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, + flowi6_to_flowi(&fl6)); + err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (err) { + goto free_nskb; + } + + dst = xfrm_lookup(net, dst, 
flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip6_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); + +static void +synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. 
+ */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, + IP_CT_NEW, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, + tcp_hdr_size); +} + +static void +synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} + +bool +synproxy_recv_client_ack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); + +unsigned int +ipv6_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + 
struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0 || nexthdr != IPPROTO_TCP) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack_ipv6(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); + +static const struct nf_hook_ops ipv6_synproxy_ops[] = { + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref6 == 0) { + err = nf_register_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref6++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); + +void 
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref6--; + if (snet->hook_ref6 == 0) + nf_unregister_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); +#endif /* CONFIG_IPV6 */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bcf17fb46d96..cae5c46e2dd4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1446,25 +1446,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct net *net, - struct nft_base_chain *chain, - struct nft_stats __percpu *newstats) +static void nft_chain_stats_replace(struct nft_trans *trans) { - struct nft_stats __percpu *oldstats; + struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); - if (newstats == NULL) + if (!nft_trans_chain_stats(trans)) return; - if (rcu_access_pointer(chain->stats)) { - oldstats = rcu_dereference_protected(chain->stats, - lockdep_commit_lock_is_held(net)); - rcu_assign_pointer(chain->stats, newstats); - synchronize_rcu(); - free_percpu(oldstats); - } else { - rcu_assign_pointer(chain->stats, newstats); + rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); + + if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); - } } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) @@ -3877,6 +3870,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, @@ -4330,7 +4324,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, - u64 timeout, gfp_t gfp) + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4345,9 +4339,11 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) - *nft_set_ext_expiration(ext) = - get_jiffies_64() + timeout; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (expiration == 0) + *nft_set_ext_expiration(ext) += timeout; + } if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -4412,6 +4408,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_trans *trans; u32 flags = 0; u64 timeout; + u64 expiration; u8 ulen; int err; @@ -4455,6 +4452,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } + expiration = 0; + if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], + &expiration); + if (err) + return err; + } + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ 
-4537,7 +4544,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, - timeout, GFP_KERNEL); + timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4739,7 +4746,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - GFP_KERNEL); + 0, GFP_KERNEL); if (elem.priv == NULL) goto err2; @@ -6359,9 +6366,9 @@ static void nft_chain_commit_update(struct nft_trans *trans) if (!nft_is_base_chain(trans->ctx.chain)) return; + nft_chain_stats_replace(trans); + basechain = nft_base_chain(trans->ctx.chain); - nft_chain_stats_replace(trans->ctx.net, basechain, - nft_trans_chain_stats(trans)); switch (nft_trans_chain_policy(trans)) { case NF_DROP: @@ -6378,6 +6385,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: + free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f42326b40d6f..9f5dea0064ea 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); + const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) @@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, else if (ip->ttl <= f_ttl) return 1; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } - endfor_ifa(in_dev); - return ret; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 89750f74e3a2..b6a7ce622c72 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -859,7 +859,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) } skb_put(e->skb, diff); } - if (!skb_make_writable(e->skb, data_len)) + if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index dfcdea6619f1..827ab6196df9 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -21,6 +21,7 @@ #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_expect.h> struct nft_ct { enum nft_ct_keys key:8; @@ -1153,6 +1154,135 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .owner = THIS_MODULE, }; +struct nft_ct_expect_obj { + u16 l3num; + __be16 dport; + u8 l4proto; + u8 size; + u32 timeout; +}; + +static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (!tb[NFTA_CT_EXPECT_L4PROTO] || + !tb[NFTA_CT_EXPECT_DPORT] || + !tb[NFTA_CT_EXPECT_TIMEOUT] || + !tb[NFTA_CT_EXPECT_SIZE]) + return -EINVAL; + + priv->l3num = ctx->family; + if (tb[NFTA_CT_EXPECT_L3PROTO]) + priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + priv->dport = 
nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); + priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); + priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_ct_expect_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || + nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || + nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || + nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || + nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) + return -1; + + return 0; +} + +static void nft_ct_expect_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + struct nf_conntrack_expect *exp; + enum ip_conntrack_info ctinfo; + struct nf_conn_help *help; + enum ip_conntrack_dir dir; + u16 l3num = priv->l3num; + struct nf_conn *ct; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct || ctinfo == IP_CT_UNTRACKED) { + regs->verdict.code = NFT_BREAK; + return; + } + dir = CTINFO2DIR(ctinfo); + + help = nfct_help(ct); + if (!help) + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (!help) { + regs->verdict.code = NF_DROP; + return; + } + + if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { + regs->verdict.code = NFT_BREAK; + return; + } + if (l3num == NFPROTO_INET) + l3num = nf_ct_l3num(ct); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + regs->verdict.code = NF_DROP; + return; + } + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + priv->l4proto, NULL, &priv->dport); + exp->timeout.expires = jiffies + priv->timeout * HZ; + + if (nf_ct_expect_related(exp) != 0) + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { + [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, + [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_expect_obj_type; + +static const struct nft_object_ops nft_ct_expect_obj_ops = { + .type = &nft_ct_expect_obj_type, + .size = sizeof(struct nft_ct_expect_obj), + .eval = nft_ct_expect_obj_eval, + .init = nft_ct_expect_obj_init, + .destroy = nft_ct_expect_obj_destroy, + .dump = nft_ct_expect_obj_dump, +}; + +static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_EXPECT, + .ops = &nft_ct_expect_obj_ops, + .maxattr = NFTA_CT_EXPECT_MAX, + .policy = nft_ct_expect_policy, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -1170,17 +1300,23 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; + + err = nft_register_obj(&nft_ct_expect_obj_type); + if (err < 0) + goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) - goto err3; + goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err4: + nft_unregister_obj(&nft_ct_expect_obj_type); +#endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); 
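
The hunk above threads the new expectation object into nft_ct_module_init()'s goto-unwind ladder: the config-gated timeout registration now bails out through a new err4 label that unregisters nft_ct_expect_obj_type, while err3 keeps pairing with the helper object registered just before it. A compilable sketch of the idiom with hypothetical stand-in names (register_foo() and friends are not from this patch):

#include <stdio.h>

#define CONFIG_EXAMPLE_OPT 1	/* stand-in for a Kconfig option */

/* Stubs standing in for the real registration calls. */
static int register_foo(void) { return 0; }
static int register_bar(void) { return 0; }
static int register_baz(void) { return 0; }
static void unregister_foo(void) { }
static void unregister_bar(void) { }

static int example_init(void)
{
	int err;

	err = register_foo();
	if (err)
		return err;

	err = register_bar();
	if (err)
		goto err_foo;

#ifdef CONFIG_EXAMPLE_OPT
	err = register_baz();	/* optional step, registered last */
	if (err)
		goto err_bar;
#endif
	return 0;

#ifdef CONFIG_EXAMPLE_OPT
err_bar:	/* unwind strictly in reverse registration order */
	unregister_bar();
#endif
err_foo:
	unregister_foo();
	return err;
}

int main(void)
{
	printf("init: %d\n", example_init());
	return 0;
}

Each registration owns exactly one unwind label, so inserting a step, as the patch does with the expect object, only renumbers the labels after the insertion point.
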
-#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -1193,6 +1329,7 @@ static void __exit nft_ct_module_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif + nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -1207,3 +1344,4 @@ MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 505bdfc66801..33833a0cb989 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -56,7 +56,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, elem = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], - timeout, GFP_ATOMIC); + timeout, 0, GFP_ATOMIC); if (elem == NULL) goto err1; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a7aa6c5250a4..a5e8469859e3 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,6 +59,103 @@ err: regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. + */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, + unsigned int *offset, int target) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + bool found = false; + __be32 info; + int optlen; + + iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!iph) + return -EBADMSG; + start = sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies + * the options. + */ + if (skb_copy_bits(skb, start, opt->__data, optlen)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (!opt->srr) + break; + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; + if (found) + *offset = opt->srr + start; + break; + case IPOPT_RR: + if (!opt->rr) + break; + *offset = opt->rr + start; + found = true; + break; + case IPOPT_RA: + if (!opt->router_alert) + break; + *offset = opt->router_alert + start; + found = true; + break; + default: + return -EOPNOTSUPP; + } + return found ? 
target : -ENOENT; +} + +static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + unsigned int offset; + int err; + + if (skb->protocol != htons(ETH_P_IP)) + goto err; + + err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { + goto err; + } + offset += priv->offset; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + goto err; + return; +err: + regs->verdict.code = NFT_BREAK; +} + static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) @@ -153,7 +250,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) return; - if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + if (skb_ensure_writable(pkt->skb, + pkt->xt.thoff + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -311,6 +409,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg, priv->len); } +static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + switch (priv->type) { + case IPOPT_SSRR: + case IPOPT_LSRR: + case IPOPT_RR: + case IPOPT_RA: + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -357,6 +477,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_ipv4_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv4_eval, + .init = nft_exthdr_ipv4_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -397,6 +525,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; + case NFT_EXTHDR_OP_IPV4: + if (ctx->family != NFPROTO_IPV6) { + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv4_ops; + } + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 680bd9f38a81..1260f78a034d 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -240,7 +240,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, tsum)); } - if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) || skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -256,7 +256,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return -1; nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -309,7 +309,7 @@ static void nft_payload_set_eval(const struct 
nft_expr *expr, goto err; } - if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + if (skb_ensure_writable(skb, max(offset + priv->len, 0)) || skb_store_bits(skb, offset, src, priv->len) < 0) goto err; diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index b1054a3d18c5..eababc354ff1 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -31,7 +31,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), @@ -49,7 +49,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), @@ -79,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 8221a5ce44bf..7873b834c300 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -29,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ipt_TTL_info *info = par->targinfo; int new_ttl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); @@ -69,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) const struct ip6t_HL_info *info = par->targinfo; int new_hl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 0b3a1b291c91..122db9fbb9f4 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (par->fragoff != 0) return 0; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 666f4ca9b15f..30e99464171b 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -28,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, - unsigned int tcphoff, unsigned int minlen) + unsigned int tcphoff) { const struct xt_tcpoptstrip_target_info *info = par->targinfo; + struct tcphdr *tcph, _th; unsigned int optl, i, j; - struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len, tcp_hdrlen; + int tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return XT_CONTINUE; - if 
(!skb_make_writable(skb, skb->len)) + tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th); + if (!tcph) return NF_DROP; - len = skb->len - tcphoff; - if (len < (int)sizeof(struct tcphdr)) - return NF_DROP; - - tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; + if (tcp_hdrlen < sizeof(struct tcphdr)) + return NF_DROP; - if (len < tcp_hdrlen) + if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen)) return NF_DROP; - opt = (u_int8_t *)tcph; + /* must reload tcph, might have been moved */ + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u8 *)tcph; /* * Walk through all TCP options - if we find some option to remove, @@ -88,8 +88,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), - sizeof(struct iphdr) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } #if IS_ENABLED(CONFIG_IP6_NF_MANGLE) @@ -106,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par, tcphoff, - sizeof(*ipv6h) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, tcphoff); } #endif diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index 140ce6be639a..0c9e014e30b4 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -2,7 +2,7 @@ /* * xt_iprange - Netfilter module to match IP address ranges * - * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) CC Computer Consultants GmbH, 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -130,7 +130,7 @@ static void __exit iprange_mt_exit(void) module_init(iprange_mt_init); module_exit(iprange_mt_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); MODULE_ALIAS("ipt_iprange"); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 95f64a99e425..e85ce69924ae 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -22,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par) struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; + if (info->match & ~XT_OWNER_MASK) + return -EINVAL; + /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. 
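The skb_make_writable() to skb_ensure_writable() conversions above (here and in the xt_DSCP, xt_HL and xt_TCPMSS hunks) flip the truth convention: the old helper returned true on success, while the new one follows the usual 0-or-negative-errno style, so every call site inverts its test. A minimal side-by-side sketch, with mangle_old()/mangle_new() as hypothetical call sites, not kernel functions:

	#include <linux/skbuff.h>
	#include <linux/ip.h>
	#include <linux/netfilter.h>

	/* Old convention: boolean, false meant the unclone/copy failed. */
	static unsigned int mangle_old(struct sk_buff *skb)
	{
		if (!skb_make_writable(skb, sizeof(struct iphdr)))
			return NF_DROP;
		return NF_ACCEPT;
	}

	/* New convention: 0 on success, -errno on failure, hence no '!'. */
	static unsigned int mangle_new(struct sk_buff *skb)
	{
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return NF_DROP;
		return NF_ACCEPT;
	}

Note the xt_HL hunks also shrink the writable length from skb->len to just the header being mangled, which avoids linearizing the whole packet to rewrite one byte.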
*/ @@ -88,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index f099228cb9c4..ecbfa291fb70 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -2,7 +2,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module which implements the set match and SET target @@ -18,7 +18,7 @@ #include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("Xtables: IP set match and target module"); MODULE_ALIAS("xt_SET"); MODULE_ALIAS("ipt_set"); @@ -436,6 +436,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; ip_set_id_t index; + int ret = 0; if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, @@ -453,17 +454,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find del_set index %u as target\n", info->del_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_add; } } if (info->map_set.index != IPSET_INVALID_ID) { if (strncmp(par->table, "mangle", 7)) { pr_info_ratelimited("--map-set only usable from mangle table\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && @@ -471,20 +471,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find map_set index %u as target\n", info->map_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->del_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_del; } } @@ -492,16 +488,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { pr_info_ratelimited("SET target dimension over the 
limit!\n"); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->del_set.index); - if (info->map_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->map_set.index); - return -ERANGE; + ret = -ERANGE; + goto cleanup_mark; } return 0; +cleanup_mark: + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +cleanup_del: + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +cleanup_add: + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return ret; } static void diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e9ddfd782d16..90b2ab9dd449 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -241,13 +241,8 @@ static __net_init int netlink_tap_init_net(struct net *net) return 0; } -static void __net_exit netlink_tap_exit_net(struct net *net) -{ -} - static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, - .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; @@ -2544,12 +2539,10 @@ struct nl_seq_iter { int link; }; -static int netlink_walk_start(struct nl_seq_iter *iter) +static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); - - return 0; } static void netlink_walk_stop(struct nl_seq_iter *iter) @@ -2565,8 +2558,6 @@ static void *__netlink_seq_next(struct seq_file *seq) do { for (;;) { - int err; - nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { @@ -2583,9 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq) if (++iter->link >= MAX_LINKS) return NULL; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); @@ -2597,13 +2586,10 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; - int err; iter->link = 0; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6747bc57b6fa..33b388103741 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1334,7 +1334,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { - if (likely(!IS_ERR(reply))) { + if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. 
*/ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f927de9bda0a..3fc38d16c456 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -248,8 +248,6 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) */ void ovs_vport_del(struct vport *vport) { - ASSERT_OVSL(); - hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5f78df080573..8d54f3047768 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) smp_wmb(); } -static int __packet_get_status(struct packet_sock *po, void *frame) +static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; @@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, return ts_status; } -static void *packet_lookup_frame(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int position, - int status) +static void *packet_lookup_frame(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int position, + int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; @@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; - if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -1003,7 +1003,6 @@ static void prb_fill_curr_block(char *curr, /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, - int status, unsigned int len ) { @@ -1075,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po, po->rx_ring.head, status); return curr; case TPACKET_V3: - return __packet_lookup_frame_in_block(po, skb, status, len); + return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); @@ -1083,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po, } } -static void *prb_lookup_block(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int idx, - int status) +static void *prb_lookup_block(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int idx, + int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); @@ -1199,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po) #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 -static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.frame_max + 1; - idx = po->rx_ring.head; + len = READ_ONCE(po->rx_ring.frame_max) + 1; + idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1212,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off) return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.prb_bdqc.knum_blocks; - 
idx = po->rx_ring.prb_bdqc.kactive_blk_num; + len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); + idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1225,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(const struct packet_sock *po, + const struct sk_buff *skb) { - struct sock *sk = &po->sk; + const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { - int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - (skb ? skb->truesize : 0); - if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + + if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; @@ -1258,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int ret; - bool has_room; + int pressure, ret; - spin_lock_bh(&po->sk.sk_receive_queue.lock); ret = __packet_rcv_has_room(po, skb); - has_room = ret == ROOM_NORMAL; - if (po->pressure == has_room) - po->pressure = !has_room; - spin_unlock_bh(&po->sk.sk_receive_queue.lock); + pressure = ret != ROOM_NORMAL; + + if (READ_ONCE(po->pressure) != pressure) + WRITE_ONCE(po->pressure, pressure); return ret; } +static void packet_rcv_try_clear_pressure(struct packet_sock *po) +{ + if (READ_ONCE(po->pressure) && + __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); +} + static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); @@ -1351,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(f->arr[i]); - if (po_next != po_skip && !po_next->pressure && + if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -2126,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2193,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!res) goto drop_n_restore; + /* If we are flooded, just give up */ + if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { + atomic_inc(&po->tp_drops); + goto drop_n_restore; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && @@ -2263,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. 
*/ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } @@ -2379,9 +2390,9 @@ drop: return 0; drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -3318,8 +3329,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; - if (pkt_sk(sk)->pressure) - packet_rcv_has_room(pkt_sk(sk), NULL); + packet_rcv_try_clear_pressure(pkt_sk(sk)); if (pkt_sk(sk)->has_vnet_hdr) { err = packet_rcv_vnet(msg, skb, &len); @@ -3891,6 +3901,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3907,14 +3918,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; } @@ -4133,8 +4147,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - po->pressure = 0; + packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { diff --git a/net/packet/internal.h b/net/packet/internal.h index c70a2794456f..82fb2b10f790 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -132,6 +132,7 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; + atomic_t tp_drops ____cacheline_aligned_in_smp; }; static struct packet_sock *pkt_sk(struct sock *sk) diff --git a/net/rds/ib.c b/net/rds/ib.c index b8d581b779b2..ec05d91aa9a2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); + iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } @@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); + iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2c72d95c3050..360fdd3eaa77 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -877,6 +877,23 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. 
+config NET_ACT_CTINFO + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help + Say Y here to allow transfer of a connmark stored information. + Current actions transfer connmark stored DSCP into + ipv4/v6 diffserv and/or to transfer connmark to packet + mark. Both are useful for restoring egress based marks + back onto ingress connections for qdisc priority mapping + purposes. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ctinfo. + config NET_ACT_SKBMOD tristate "skb data modification action" depends on NET_CLS_ACT @@ -924,14 +941,6 @@ config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE -config NET_CLS_IND - bool "Incoming device classification" - depends on NET_CLS_U32 || NET_CLS_FW - ---help--- - Say Y here to extend the u32 and fw classifier to support - classification based on the incoming device. This option is - likely to disappear in favour of the metadata ematch. - endif # NET_SCHED config NET_SCH_FIFO diff --git a/net/sched/Makefile b/net/sched/Makefile index 8a40431d7b5c..d54bfcbd7981 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c new file mode 100644 index 000000000000..10eb2bb99861 --- /dev/null +++ b/net/sched/act_ctinfo.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions + * + * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/pkt_cls.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> +#include <uapi/linux/tc_act/tc_ctinfo.h> +#include <net/tc_act/tc_ctinfo.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> + +static struct tc_action_ops act_ctinfo_ops; +static unsigned int ctinfo_net_id; + +static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb, int wlen, int proto) +{ + u8 dscp, newdscp; + + newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + ~INET_ECN_MASK; + + switch (proto) { + case NFPROTO_IPV4: + dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv4_change_dsfield(ip_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + case NFPROTO_IPV6: + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv6_change_dsfield(ipv6_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; 
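To make the tcf_ctinfo_dscp_set() arithmetic above concrete: the connmark field is isolated with dscpmask, shifted down to a 6-bit DSCP, shifted left two bits into dsfield position, and the two ECN bits are left untouched. A worked example, assuming a layout where the DSCP lives in connmark bits 8-13 (dscpmask 0x00003f00):

	#include <net/inet_ecn.h>	/* INET_ECN_MASK == 0x3 */

	static u8 dscp_from_mark(u32 mark, u32 dscpmask, u8 dscpmaskshift)
	{
		/* mark 0x00002a00, mask 0x00003f00, shift 8:
		 *   (mark & mask) >> shift = 0x2a  (6-bit DSCP)
		 *   << 2                   = 0xa8  (dsfield position)
		 *   & ~INET_ECN_MASK keeps the ECN bits clear
		 */
		return (((mark & dscpmask) >> dscpmaskshift) << 2) &
		       ~INET_ECN_MASK;
	}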
+ } + } + break; + default: + break; + } +} + +static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb) +{ + ca->stats_cpmark_set++; + skb->mark = ct->mark & cp->cpmarkmask; +} + +static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + const struct nf_conntrack_tuple_hash *thash = NULL; + struct tcf_ctinfo *ca = to_ctinfo(a); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; + enum ip_conntrack_info ctinfo; + struct tcf_ctinfo_params *cp; + struct nf_conn *ct; + int proto, wlen; + int action; + + cp = rcu_dereference_bh(ca->params); + + tcf_lastuse_update(&ca->tcf_tm); + bstats_update(&ca->tcf_bstats, skb); + action = READ_ONCE(ca->tcf_action); + + wlen = skb_network_offset(skb); + if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV4; + } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV6; + } else { + goto out; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { /* look harder, usually ingress */ + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + proto, cp->net, &tuple)) + goto out; + zone.id = cp->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(cp->net, &zone, &tuple); + if (!thash) + goto out; + + ct = nf_ct_tuplehash_to_ctrack(thash); + } + + if (cp->mode & CTINFO_MODE_DSCP) + if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); + + if (cp->mode & CTINFO_MODE_CPMARK) + tcf_ctinfo_cpmark_set(ct, ca, cp, skb); + + if (thash) + nf_ct_put(ct); +out: + return action; +} + +static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { + [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct + tc_ctinfo) }, + [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, + [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, +}; + +static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct nlattr *tb[TCA_CTINFO_MAX + 1]; + struct tcf_ctinfo_params *cp_new; + struct tcf_chain *goto_ch = NULL; + u32 dscpmask = 0, dscpstatemask; + struct tc_ctinfo *actparm; + struct tcf_ctinfo *ci; + u8 dscpmaskshift; + int ret = 0, err; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_CTINFO_ACT]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing required TCA_CTINFO_ACT attribute"); + return -EINVAL; + } + actparm = nla_data(tb[TCA_CTINFO_ACT]); + + /* do some basic validation here before dynamically allocating things */ + /* that we would otherwise have to clean up. */ + if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { + dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); + /* need contiguous 6 bit mask */ + dscpmaskshift = dscpmask ? 
__ffs(dscpmask) : 0; + if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_MASK], + "dscp mask must be 6 contiguous bits"); + return -EINVAL; + } + dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? + nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + /* mask & statemask must not overlap */ + if (dscpmask & dscpstatemask) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], + "dscp statemask must not overlap dscp mask"); + return -EINVAL; + } + } + + /* done the validation:now to the actual action allocation */ + err = tcf_idr_check_alloc(tn, &actparm->index, a, bind); + if (!err) { + ret = tcf_idr_create(tn, actparm->index, est, a, + &act_ctinfo_ops, bind, false); + if (ret) { + tcf_idr_cleanup(tn, actparm->index); + return ret; + } + ret = ACT_P_CREATED; + } else if (err > 0) { + if (bind) /* don't override defaults */ + return 0; + if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } else { + return err; + } + + err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + ci = to_ctinfo(*a); + + cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); + if (unlikely(!cp_new)) { + err = -ENOMEM; + goto put_chain; + } + + cp_new->net = net; + cp_new->zone = tb[TCA_CTINFO_ZONE] ? + nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + if (dscpmask) { + cp_new->dscpmask = dscpmask; + cp_new->dscpmaskshift = dscpmaskshift; + cp_new->dscpstatemask = dscpstatemask; + cp_new->mode |= CTINFO_MODE_DSCP; + } + + if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { + cp_new->cpmarkmask = + nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); + cp_new->mode |= CTINFO_MODE_CPMARK; + } + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); + rcu_swap_protected(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (cp_new) + kfree_rcu(cp_new, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tc_ctinfo opt = { + .index = ci->tcf_index, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, + }; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ctinfo_params *cp; + struct tcf_t t; + + spin_lock_bh(&ci->tcf_lock); + cp = rcu_dereference_protected(ci->params, + lockdep_is_held(&ci->tcf_lock)); + + tcf_tm_dump(&t, &ci->tcf_tm); + if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) + goto nla_put_failure; + + opt.action = ci->tcf_action; + if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) + goto nla_put_failure; + + if (cp->mode & CTINFO_MODE_DSCP) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, + cp->dscpmask)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, + cp->dscpstatemask)) + goto nla_put_failure; + } + + if (cp->mode & CTINFO_MODE_CPMARK) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, + cp->cpmarkmask)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, + ci->stats_dscp_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, 
TCA_CTINFO_STATS_DSCP_ERROR, + ci->stats_dscp_error, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, + ci->stats_cpmark_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + spin_unlock_bh(&ci->tcf_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(&ci->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_ctinfo_ops = { + .kind = "ctinfo", + .id = TCA_ID_CTINFO, + .owner = THIS_MODULE, + .act = tcf_ctinfo_act, + .dump = tcf_ctinfo_dump, + .init = tcf_ctinfo_init, + .walk = tcf_ctinfo_walker, + .lookup = tcf_ctinfo_search, + .size = sizeof(struct tcf_ctinfo), +}; + +static __net_init int ctinfo_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tc_action_net_init(tn, &act_ctinfo_ops); +} + +static void __net_exit ctinfo_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, ctinfo_net_id); +} + +static struct pernet_operations ctinfo_net_ops = { + .init = ctinfo_init_net, + .exit_batch = ctinfo_exit_net, + .id = &ctinfo_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ctinfo_init_module(void) +{ + return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +static void __exit ctinfo_cleanup_module(void) +{ + tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +module_init(ctinfo_init_module); +module_exit(ctinfo_cleanup_module); +MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); +MODULE_DESCRIPTION("Connection tracking mark actions"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index eedd5786c084..ce2e9b1c9850 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -27,7 +27,7 @@ #include <net/dst_metadata.h> struct fl_flow_key { - int indev_ifindex; + struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; @@ -284,7 +284,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, list_for_each_entry_rcu(mask, &head->masks, list) { fl_clear_masked_range(&skb_key, mask); - skb_key.indev_ifindex = skb->skb_iif; + skb_flow_dissect_meta(skb, &mask->dissector, &skb_key); /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. 
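The cls_flower conversion above stops storing the ingress ifindex in an ad-hoc fl_flow_key field and instead dissects it into FLOW_DISSECTOR_KEY_META via skb_flow_dissect_meta(), which is what let CONFIG_NET_CLS_IND be dropped from net/sched/Kconfig earlier in this series. Roughly, the meta key reduces to the following (paraphrased, not the authoritative definition):

	struct flow_dissector_key_meta {
		int ingress_ifindex;	/* filled from skb->skb_iif */
	};

	static void dissect_meta(const struct sk_buff *skb,
				 struct flow_dissector_key_meta *meta)
	{
		meta->ingress_ifindex = skb->skb_iif;
	}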
*/ @@ -1021,15 +1021,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, { __be16 ethertype; int ret = 0; -#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) return err; - key->indev_ifindex = err; - mask->indev_ifindex = 0xffffffff; + key->meta.ingress_ifindex = err; + mask->meta.ingress_ifindex = 0xffffffff; } -#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, @@ -1282,6 +1281,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_META, meta); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -2123,10 +2124,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb, static int fl_dump_key(struct sk_buff *skb, struct net *net, struct fl_flow_key *key, struct fl_flow_key *mask) { - if (mask->indev_ifindex) { + if (mask->meta.ingress_ifindex) { struct net_device *dev; - dev = __dev_get_by_index(net, key->indev_ifindex); + dev = __dev_get_by_index(net, key->meta.ingress_ifindex); if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) goto nla_put_failure; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4dab833f66cb..c9496c920d6f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -8,9 +8,6 @@ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension - * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available */ #include <linux/module.h> @@ -37,9 +34,7 @@ struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; @@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, f->ifindex)) continue; -#endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; @@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, return ret; f->ifindex = ret; } -#endif /* CONFIG_NET_CLS_IND */ err = -EINVAL; if (tb[TCA_FW_MASK]) { @@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, fnew->id = f->id; fnew->res = f->res; -#ifdef CONFIG_NET_CLS_IND fnew->ifindex = f->ifindex; -#endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, @@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; 
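With CONFIG_NET_CLS_IND gone, the fw and u32 classifiers above compile their ifindex check unconditionally. The check itself amounts to something like this (a sketch of the tcf_match_indev() contract, not its verbatim body):

	static bool match_indev(const struct sk_buff *skb, int ifindex)
	{
		if (!ifindex)
			return true;	/* no indev configured: match all */
		if (!skb->skb_iif)
			return false;	/* no recorded ingress device */
		return skb->skb_iif == ifindex;
	}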
} -#endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 38c0a9f0f296..a30d2f8feb32 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { unsigned int in_hw_count; struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; + bool deleting; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -258,7 +259,11 @@ err_exts_init: static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { - return -EOPNOTSUPP; + struct cls_mall_head *head = rtnl_dereference(tp->root); + + head->deleting = true; + *last = true; + return 0; } static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, @@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, if (arg->count < arg->skip) goto skip; - if (!head) + if (!head || head->deleting) return; if (arg->fn(tp, head, arg) < 0) arg->stop = 1; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7727de5e073..be9e46c77e8b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -20,9 +20,6 @@ * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available - * * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ @@ -48,9 +45,7 @@ struct tc_u_knode { u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; @@ -176,12 +171,10 @@ check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } -#endif #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif @@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &n->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); @@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; n->ifindex = ret; } -#endif return 0; } @@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); -#ifdef CONFIG_NET_CLS_IND new->ifindex = n->ifindex; -#endif new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; @@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } -#endif #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(sizeof(struct tc_u32_pcnt) + n->sel.nkeys * sizeof(u64), @@ -1422,9 +1409,7 @@ static int __init init_u32(void) #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif -#ifdef CONFIG_NET_CLS_IND pr_info(" input device check on\n"); -#endif #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c 
index 0f65f617756b..599730f804d7 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -114,6 +114,7 @@ nla_put_failure: } static const struct Qdisc_class_ops ingress_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = ingress_find, .walk = ingress_walk, @@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch) } static const struct Qdisc_class_ops clsact_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = clsact_find, .walk = ingress_walk, diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 2cae7440349c..74847d613835 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -94,11 +94,6 @@ static const struct net_offload sctp6_offload = { }, }; -static const struct skb_checksum_ops crc32c_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; - int __init sctp_offload_init(void) { int ret; @@ -111,7 +106,7 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &crc32c_csum_ops; + crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 23af232c0a25..2d47adcb4cbe 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -81,7 +81,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, return; } - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7621ec2f539c..302e355f2ebc 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -123,30 +123,11 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); -static int smc_release(struct socket *sock) +static int __smc_release(struct smc_sock *smc) { - struct sock *sk = sock->sk; - struct smc_sock *smc; + struct sock *sk = &smc->sk; int rc = 0; - if (!sk) - goto out; - - smc = smc_sk(sk); - - /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) - tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); - - if (sk->sk_state == SMC_LISTEN) - /* smc_close_non_accepted() is called and acquires - * sock lock for child sockets again - */ - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - else - lock_sock(sk); - if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); @@ -174,6 +155,35 @@ static int smc_release(struct socket *sock) smc_conn_free(&smc->conn); } + return rc; +} + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + flush_work(&smc->connect_work); + + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + rc = __smc_release(smc); + /* detach socket */ sock_orphan(sk); sock->sk = NULL; @@ -964,26 +974,7 @@ void smc_close_non_accepted(struct sock *sk) if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) { - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } - sk->sk_prot->unhash(sk); - if (smc->clcsock) { 
- struct socket *tcp; - - tcp = smc->clcsock; - smc->clcsock = NULL; - sock_release(tcp); - } - if (smc->use_fallback) { - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - } else { - if (sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); - } + __smc_release(smc); release_sock(sk); sock_put(sk); /* final sock_put */ } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 745afd82f281..49bcebff6378 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } @@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } diff --git a/net/socket.c b/net/socket.c index 38eec1583f6d..963df5dbdd54 100644 --- a/net/socket.c +++ b/net/socket.c @@ -429,7 +429,7 @@ static int sock_map_fd(struct socket *sock, int flags) } newfile = sock_alloc_file(sock, flags, NULL); - if (likely(!IS_ERR(newfile))) { + if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index f64f36a83063..b3815c1e8f2e 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -157,18 +157,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, return 0; } - skb = alloc_skb(0, GFP_ATOMIC); + skb = alloc_skb_for_msg(head); if (!skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *_strp_msg(skb) = *_strp_msg(head); + strp->skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; strp->skb_head = skb; head = skb; } else { diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6c997d4a6218..1336f3cdad38 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; @@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? 
*/ diff --git a/net/tipc/link.c b/net/tipc/link.c index 2050fd386642..f8bf63befe1f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -107,7 +107,6 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @prev_from: sequence number of most previous retransmission request - * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. @@ -167,7 +166,6 @@ struct tipc_link { u16 snd_nxt; u16 prev_from; u16 window; - u16 stale_cnt; unsigned long stale_limit; /* Reception */ @@ -249,9 +247,9 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); /* * Simple non-static link routines (i.e. referenced outside this file) @@ -734,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l) if (msg_user(msg) == MSG_FRAGMENTER) { if (msg_type(msg) != FIRST_FRAGMENT) return; - length = msg_size(msg_get_wrapped(msg)); + length = msg_size(msg_inner_hdr(msg)); } l->stats.msg_lengths_total += length; l->stats.msg_length_counts++; @@ -910,7 +908,6 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -1044,32 +1041,68 @@ static void tipc_link_advance_backlog(struct tipc_link *l, l->snd_nxt = seqno; } -static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) +/** + * link_retransmit_failure() - Detect repeated retransmit failures + * @l: tipc link sender + * @r: tipc link receiver (= l in case of unicast) + * @from: seqno of the 1st packet in retransmit request + * @rc: returned code + * + * Return: true if the repeated retransmit failures happens, otherwise + * false + */ +static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, + u16 from, int *rc) { - struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + + if (!skb) + return false; + hdr = buf_msg(skb); + + /* Detect repeated retransmit failures on same packet */ + if (r->prev_from != from) { + r->prev_from = from; + r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + } else if (time_after(jiffies, r->stale_limit)) { + pr_warn("Retransmission failure on link <%s>\n", l->name); + link_print(l, "State of link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(hdr), msg_type(hdr), msg_size(hdr), + msg_errcode(hdr)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); + + trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); + + if (link_is_bc_sndlink(l)) + *rc = TIPC_LINK_DOWN_EVT; + + *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); 
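link_retransmit_failure() above drops the old counter of identical retransmit requests (stale_cnt) and keys purely off time: the first request for a given sequence number arms a tolerance-derived deadline, and only a repeat request arriving after that deadline declares the link failed. Reduced to its core (struct and field names illustrative):

	#include <linux/jiffies.h>

	struct retrans_state {
		u16 prev_from;
		unsigned long stale_limit;	/* jiffies deadline */
	};

	static bool retrans_failed(struct retrans_state *r, u16 from,
				   unsigned long tolerance_ms)
	{
		if (r->prev_from != from) {	/* new request: arm deadline */
			r->prev_from = from;
			r->stale_limit = jiffies +
					 msecs_to_jiffies(tolerance_ms);
			return false;
		}
		return time_after(jiffies, r->stale_limit);
	}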
+ return true; + } - pr_warn("Retransmission failure on link <%s>\n", l->name); - link_print(l, "State of link "); - pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", - msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); - pr_info("sqno %u, prev: %x, src: %x\n", - msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); + return false; } -/* tipc_link_retrans() - retransmit one or more packets +/* tipc_link_bc_retrans() - retransmit zero or more packets * @l: the link to transmit on * @r: the receiving link ordering the retransmit. Same as l if unicast * @from: retransmit from (inclusive) this sequence number * @to: retransmit to (inclusive) this sequence number * xmitq: queue for accumulating the retransmitted packets */ -static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; + int rc = 0; if (!skb) return 0; @@ -1077,20 +1110,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, return 0; trace_tipc_link_retrans(r, from, to, &l->transmq); - /* Detect repeated retransmit failures on same packet */ - if (r->prev_from != from) { - r->prev_from = from; - r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); - r->stale_cnt = 0; - } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { - link_retransmit_failure(l, skb); - trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); - trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); - trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); - if (link_is_bc_sndlink(l)) - return TIPC_LINK_DOWN_EVT; - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - } + + if (link_retransmit_failure(l, r, from, &rc)) + return rc; skb_queue_walk(&l->transmq, skb) { hdr = buf_msg(skb); @@ -1103,7 +1125,7 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, continue; TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; } - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1324,17 +1346,23 @@ exit: * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * + * In case of a repeated retransmit failures, the call will return shortly + * with a returned code (e.g. 
TIPC_LINK_DOWN_EVT) */ -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) { struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; - u16 seqno; - u16 n = 0; + u16 seqno, n = 0; + int rc = 0; + + if (gap && link_retransmit_failure(l, l, acked + 1, &rc)) + return rc; skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); @@ -1369,6 +1397,8 @@ next_gap_ack: goto next_gap_ack; } } + + return 0; } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1481,7 +1511,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1918,7 +1947,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); /* If NACK, retransmit will now start at right position */ if (gap) @@ -2035,7 +2064,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) @@ -2131,7 +2160,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); l->stats.recv_nacks++; return rc; } @@ -2550,7 +2579,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); i += scnprintf(buf + i, sz - i, " %u", l->prev_from); - i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt); + i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", l->acked); list = &l->transmq; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8de02ad6e352..da509f0eb9ca 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m) return ((unchar *)m) + msg_hdr_sz(m); } -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } @@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) - m = msg_get_wrapped(m); + m = msg_inner_hdr(m); return msg_word(m, 4); } diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 99bd166bccec..d6165ad384c0 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -261,7 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = { .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, .maxattr = TIPC_NLA_MAX, - .policy = tipc_nl_policy, + .policy = tipc_nl_policy, .netnsok = true, .module = THIS_MODULE, 
.ops = tipc_genl_v2_ops, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index cf155061c472..d86030ef1232 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -691,7 +691,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *media; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -699,10 +698,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, if (!media) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; @@ -723,7 +718,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *bearer; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -731,10 +725,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, if (!bearer) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; diff --git a/net/tipc/node.c b/net/tipc/node.c index 550581d47d51..324a1f91b394 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1f9cf57d9754..40076f423dcb 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -209,6 +209,29 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, + u32 seq) +{ + struct net_device *netdev; + struct sk_buff *skb; + u8 *rcd_sn; + + skb = tcp_write_queue_tail(sk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + rcd_sn = tls_ctx->tx.rec_seq; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (netdev) + netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, + TLS_OFFLOAD_CTX_DIR_TX); + up_read(&device_offload_lock); + + clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); +} + static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) @@ -252,7 +275,7 @@ static int tls_push_record(struct sock *sk, skb_frag_address(frag), record->len - prot->prepend_size, record_type, - ctx->crypto_send.info.version); + prot->version); /* HW doesn't care about the data in the tag, because it fills it. 
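The TLS hunks above fold the RX-only tls_dev_resync_rx driver op into a single tls_dev_resync that takes a direction argument and passes the record sequence as a byte array instead of a u64. A skeleton of what a driver now implements, matching the calls shown above (the body and the int return treatment are assumptions of this sketch):

	static int my_tls_dev_resync(struct net_device *netdev,
				     struct sock *sk, u32 seq, u8 *rcd_sn,
				     enum tls_offload_ctx_dir direction)
	{
		if (direction == TLS_OFFLOAD_CTX_DIR_TX) {
			/* re-seed TX record state so HW resumes at @seq */
		} else {
			/* program RX decryption to restart at @seq/@rcd_sn */
		}
		return 0;
	}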
*/ dummy_tag_frag.page = skb_frag_page(frag); @@ -264,7 +287,11 @@ static int tls_push_record(struct sock *sk, list_add_tail(&record->list, &offload_ctx->records_list); spin_unlock_irq(&offload_ctx->lock); offload_ctx->open_record = NULL; - tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version); + + if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) + tls_device_resync_tx(sk, ctx, tp->write_seq); + + tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; @@ -551,7 +578,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) } static void tls_device_resync_rx(struct tls_context *tls_ctx, - struct sock *sk, u32 seq, u64 rcd_sn) + struct sock *sk, u32 seq, u8 *rcd_sn) { struct net_device *netdev; @@ -559,14 +586,17 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, return; netdev = READ_ONCE(tls_ctx->netdev); if (netdev) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn); + netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, + TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); } -void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + struct tls_prot_info *prot; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -574,15 +604,83 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) if (tls_ctx->rx_conf != TLS_HW) return; + prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); - resync_req = atomic64_read(&rx_ctx->resync_req); - req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); - is_req_pending = resync_req; + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); - if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + switch (rx_ctx->resync_type) { + case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); + is_req_pending = resync_req; + + if (likely(!is_req_pending) || req_seq != seq || + !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + return; + break; + case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: + if (likely(!rx_ctx->resync_nh_do_now)) + return; + + /* head of next rec is already in, note that the sock_inq will + * include the currently parsed message when called from parser + */ + if (tcp_inq(sk) > rcd_len) + return; + + rx_ctx->resync_nh_do_now = 0; + seq += rcd_len; + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + break; + } + + tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); +} + +static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, + struct tls_offload_context_rx *ctx, + struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm; + + /* device will request resyncs by itself based on stream scan */ + if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) + return; + /* already scheduled */ + if (ctx->resync_nh_do_now) + return; + /* seen decrypted fragments since last fully-failed record */ + if (ctx->resync_nh_reset) { + ctx->resync_nh_reset = 0; + ctx->resync_nh.decrypted_failed = 1; + ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; + return; + } + + if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) + return; + + /* 
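+ * the failure count has reached the current target, so we are now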
doing resync, bump the next target in case it fails */ + if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) + ctx->resync_nh.decrypted_tgt *= 2; + else + ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; + + rxm = strp_msg(skb); + + /* head of next rec is already in, parser will sync for us */ + if (tcp_inq(sk) > rxm->full_len) { + ctx->resync_nh_do_now = 1; + } else { + struct tls_prot_info *prot = &tls_ctx->prot_info; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + + tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, + rcd_sn); } } @@ -610,8 +708,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); - skb_copy_bits(skb, offset, buf, - TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + err = skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + if (err) + goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, skb, sg); @@ -625,8 +725,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb->decrypted) { + err = skb_store_bits(skb, offset, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -649,8 +752,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); - if (skb_iter->decrypted) - skb_store_bits(skb_iter, frag_pos, buf, copy); + if (skb_iter->decrypted) { + err = skb_store_bits(skb_iter, frag_pos, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -671,10 +777,6 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) int is_encrypted = !is_decrypted; struct sk_buff *skb_iter; - /* Skip if it is already decrypted */ - if (ctx->sw.decrypted) - return 0; - /* Check if all the data is decrypted already */ skb_walk_frags(skb, skb_iter) { is_decrypted &= skb_iter->decrypted; @@ -683,12 +785,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) ctx->sw.decrypted |= is_decrypted; - /* Return immedeatly if the record is either entirely plaintext or + /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ - return (is_encrypted || is_decrypted) ? 
0 : - tls_device_reencrypt(sk, skb); + if (is_decrypted) { + ctx->resync_nh_reset = 1; + return 0; + } + if (is_encrypted) { + tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); + return 0; + } + + ctx->resync_nh_reset = 1; + return tls_device_reencrypt(sk, skb); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, @@ -757,6 +868,12 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_offload_ctx; } + /* Sanity-check the rec_seq_size for stack allocations */ + if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { + rc = -EINVAL; + goto free_offload_ctx; + } + prot->prepend_size = TLS_HEADER_SIZE + nonce_size; prot->tag_size = tag_size; prot->overhead_size = prot->prepend_size + prot->tag_size; @@ -908,6 +1025,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = -ENOMEM; goto release_netdev; } + context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, ctx, 0); @@ -1015,7 +1133,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if ((dev->features & NETIF_F_HW_TLS_RX) && - !dev->tlsdev_ops->tls_dev_resync_rx) + !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index c3a5fe624b4e..1d2d804ac633 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -240,7 +240,6 @@ static int fill_sg_in(struct scatterlist *sg_in, record = tls_get_record(ctx, tcp_seq, rcd_sn); if (!record) { spin_unlock_irqrestore(&ctx->lock, flags); - WARN(1, "Record not found for seq %u\n", tcp_seq); return -EINVAL; } @@ -409,7 +408,10 @@ put_sg: put_page(sg_page(&sg_in[--resync_sgs])); kfree(sg_in); free_orig: - kfree_skb(skb); + if (nskb) + consume_skb(skb); + else + kfree_skb(skb); return nskb; } @@ -424,6 +426,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); +struct sk_buff *tls_encrypt_skb(struct sk_buff *skb) +{ + return tls_sw_fallback(skb->sk, skb); +} +EXPORT_SYMBOL_GPL(tls_encrypt_skb); + int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 455a782c7658..db585964b52b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk, /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; - tls_advance_record_sn(sk, &tls_ctx->tx, prot->version); + tls_advance_record_sn(sk, prot, &tls_ctx->tx); return rc; } @@ -1485,15 +1485,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; - int version = prot->version; struct strp_msg *rxm = strp_msg(skb); int pad, err = 0; if (!ctx->decrypted) { #ifdef CONFIG_TLS_DEVICE - err = tls_device_decrypted(sk, skb); - if (err < 0) - return err; + if (tls_ctx->rx_conf == TLS_HW) { + err = tls_device_decrypted(sk, skb); + if (err < 0) + return err; + } #endif /* Still not decrypted after tls_device */ if (!ctx->decrypted) { @@ -1501,8 +1502,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, async); if (err < 0) { if (err == -EINPROGRESS) - tls_advance_record_sn(sk, &tls_ctx->rx, - version); + tls_advance_record_sn(sk, prot, + 
&tls_ctx->rx); return err; } @@ -1517,7 +1518,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->full_len -= pad; rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx, version); + tls_advance_record_sn(sk, prot, &tls_ctx->rx); ctx->decrypted = true; ctx->saved_data_ready(sk); } else { @@ -2013,8 +2014,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } #ifdef CONFIG_TLS_DEVICE - handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, - *(u64*)tls_ctx->rx.rec_seq); + tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE, + TCP_SKB_CB(skb)->seq + rxm->offset); #endif return data_len + TLS_HEADER_SIZE; @@ -2281,8 +2282,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } - /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { + /* Sanity-check the sizes for stack allocations. */ + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || + rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { rc = -EINVAL; goto free_priv; } diff --git a/net/unix/diag.c b/net/unix/diag.c index c51a707260fa..9ff64f9df1f3 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -5,9 +5,11 @@ #include <linux/unix_diag.h> #include <linux/skbuff.h> #include <linux/module.h> +#include <linux/uidgid.h> #include <net/netlink.h> #include <net/af_unix.h> #include <net/tcp_states.h> +#include <net/sock.h> static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { @@ -111,6 +113,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +{ + uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { @@ -157,6 +165,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) goto out_nlmsg_trim; + if ((req->udiag_show & UDIAG_SHOW_UID) && + sk_diag_dump_uid(sk, skb)) + goto out_nlmsg_trim; + nlmsg_end(skb, nlh); return 0; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 169112f8aa1e..ab47bf3ab66e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -274,7 +274,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_bound(vsk); + if (__vsock_in_bound_table(vsk)) + __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); @@ -282,7 +283,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_connected(vsk); + if (__vsock_in_connected_table(vsk)) + __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); @@ -318,35 +320,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); -static bool vsock_in_bound_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_bound_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - 
return ret; -} - -static bool vsock_in_connected_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_connected_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - return ret; -} - void vsock_remove_sock(struct vsock_sock *vsk) { - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -477,8 +454,7 @@ static void vsock_pending_work(struct work_struct *work) * incoming packets can't find this socket, and to reduce the reference * count. */ - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 62dcdf082349..f2084e3f7aa4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -14,14 +14,14 @@ #include <net/sock.h> #include <net/af_vsock.h> -/* The host side's design of the feature requires 6 exact 4KB pages for - * recv/send rings respectively -- this is suboptimal considering memory - * consumption, however unluckily we have to live with it, before the - * host comes up with a better design in the future. +/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some + * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer + * hosts don't have this limitation; but, keep the defaults the same for compat. */ #define PAGE_SIZE_4K 4096 #define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) #define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) +#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -46,8 +46,9 @@ struct hvs_recv_buf { }; /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use - * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated - * buffer, because tests show there is no significant performance difference. + * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the + * guest and the host processing as one VMBUS packet is the smallest processing + * unit. * * Note: the buffer can be eliminated in the future when we add new VMBus * ringbuffer APIs that allow us to directly copy data from userspace buffer @@ -321,8 +322,11 @@ static void hvs_open_connection(struct vmbus_channel *chan) struct sockaddr_vm addr; struct sock *sk, *new = NULL; struct vsock_sock *vnew = NULL; - struct hvsock *hvs, *hvs_new = NULL; + struct hvsock *hvs = NULL; + struct hvsock *hvs_new = NULL; + int rcvbuf; int ret; + int sndbuf; if_type = &chan->offermsg.offer.if_type; if_instance = &chan->offermsg.offer.if_instance; @@ -364,9 +368,34 @@ static void hvs_open_connection(struct vmbus_channel *chan) } set_channel_read_mode(chan, HV_CALL_DIRECT); - ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE, - RINGBUFFER_HVS_RCV_SIZE, NULL, 0, - hvs_channel_cb, conn_from_host ? new : sk); + + /* Use the socket buffer sizes as hints for the VMBUS ring size. For + * server side sockets, 'sk' is the parent socket and thus, this will + * allow the child sockets to inherit the size from the parent. Keep + * the mins to the default value and align to page size as per VMBUS + * requirements. 
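+ * (Illustration: with 4 KB pages, a 4 KB SO_SNDBUF request is raised back to the 24 KB default ring, while larger requests are rounded up to a whole number of pages.)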
+ * For the max, the socket core library will limit the socket buffer + * size that can be set by the user, but, since the hv_sock + * VMBUS ring buffer is currently a physically contiguous allocation, + * restrict it further. + * Older versions of the hv_sock host-side code cannot handle bigger VMBUS + * ring buffer sizes. Use the version number to limit the change to newer + * versions. + */ + if (vmbus_proto_version < VERSION_WIN10_V5) { + sndbuf = RINGBUFFER_HVS_SND_SIZE; + rcvbuf = RINGBUFFER_HVS_RCV_SIZE; + } else { + sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); + sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); + sndbuf = ALIGN(sndbuf, PAGE_SIZE); + rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); + rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); + rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + } + + ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, + conn_from_host ? new : sk); if (ret != 0) { if (conn_from_host) { hvs_new->chan = NULL; @@ -424,6 +453,7 @@ static u32 hvs_get_local_cid(void) static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) { struct hvsock *hvs; + struct sock *sk = sk_vsock(vsk); hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); if (!hvs) @@ -431,7 +461,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->trans = hvs; hvs->vsk = vsk; - + sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE; + sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE; return 0; } @@ -627,7 +658,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, struct hvsock *hvs = vsk->trans; struct vmbus_channel *chan = hvs->chan; struct hvs_send_buf *send_buf; - ssize_t to_write, max_writable, ret; + ssize_t to_write, max_writable; + ssize_t ret = 0; + ssize_t bytes_written = 0; BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); @@ -635,20 +668,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, if (!send_buf) return -ENOMEM; - max_writable = hvs_channel_writable_bytes(chan); - to_write = min_t(ssize_t, len, max_writable); - to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); - - ret = memcpy_from_msg(send_buf->data, msg, to_write); - if (ret < 0) - goto out; + /* Reader(s) could be draining data from the channel as we write. + * Maximize bandwidth by iterating until the channel is found to be + * full. + */ + while (len) { + max_writable = hvs_channel_writable_bytes(chan); + if (!max_writable) + break; + to_write = min_t(ssize_t, len, max_writable); + to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); + /* memcpy_from_msg is safe to call in a loop as it advances the + * offsets within the message iterator. + */ + ret = memcpy_from_msg(send_buf->data, msg, to_write); + if (ret < 0) + goto out; - ret = hvs_send_data(hvs->chan, send_buf, to_write); - if (ret < 0) - goto out; + ret = hvs_send_data(hvs->chan, send_buf, to_write); + if (ret < 0) + goto out; - ret = to_write; + bytes_written += to_write; + len -= to_write; + } out: + /* If any data has been sent, return that */ + if (bytes_written) + ret = bytes_written; kfree(send_buf); return ret; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 53ad3dbb76fe..45d9afcff6d5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; } + for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { + /* + * Validate we have a policy (can be explicitly set to + * VENDOR_CMD_RAW_DATA which is non-NULL) and also that + * we have at least one of doit/dumpit. 
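+ * Rejecting the registration here surfaces a driver bug at wiphy_register() time instead of when userspace first issues the vendor command.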
+ */ + if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) + return -EINVAL; + if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && + !rdev->wiphy.vendor_commands[i].dumpit)) + return -EINVAL; + } + #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || diff --git a/net/wireless/core.h b/net/wireless/core.h index 84d36ca7a7ab..ee8388fe4a92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +struct cfg80211_internal_bss * +cfg80211_bss_update(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *tmp, + bool signal_valid, unsigned long ts); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 520d437aa8d1..fc83dd179c1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -571,6 +571,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PEER_MEASUREMENTS] = NLA_POLICY_NESTED(nl80211_pmsr_attr_policy), [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, + .len = SAE_PASSWORD_MAX_LEN }, + [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4447,6 +4450,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD) && auth_type == NL80211_AUTHTYPE_SAE) return false; @@ -4637,6 +4642,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.twt_responder = + nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) @@ -8751,7 +8759,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) static bool nl80211_valid_wpa_versions(u32 wpa_versions) { return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | - NL80211_WPA_VERSION_2)); + NL80211_WPA_VERSION_2 | + NL80211_WPA_VERSION_3)); } static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) @@ -8987,6 +8996,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } + if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD)) + return -EINVAL; + settings->sae_pwd = + nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + settings->sae_pwd_len = + nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + } + return 0; } @@ -12669,6 +12688,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb, return 0; } +static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + if (vcmd->policy == VENDOR_CMD_RAW_DATA) { + if (attr->nla_type & NLA_F_NESTED) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unexpected nested data"); + return -EINVAL; + } + + return 0; + } + + if (!(attr->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data"); + return -EINVAL; + } + + return nl80211_validate_nested(attr, vcmd->maxattr, 
vcmd->policy, + extack); +} + static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -12727,11 +12769,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]); len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy(vcmd, + info->attrs[NL80211_ATTR_VENDOR_DATA], + info->extack); + if (err) + return err; } rdev->cur_cmd_info = info; - err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev, - data, len); + err = vcmd->doit(&rdev->wiphy, wdev, data, len); rdev->cur_cmd_info = NULL; return err; } @@ -12818,6 +12865,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy( + &(*rdev)->wiphy.vendor_commands[vcmd_idx], + attrbuf[NL80211_ATTR_VENDOR_DATA], + cb->extack); + if (err) + return err; } /* 0 is the first index - add 1 to parse only once */ @@ -15086,7 +15140,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, return; } - if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -15376,6 +15432,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); +void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan); + nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL, + rdev, wdev, cookie, chan, 0, gfp); +} +EXPORT_SYMBOL(cfg80211_tx_mgmt_expired); + void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { diff --git a/net/wireless/scan.c b/net/wireless/scan.c index aa571d727903..d66e6d4b7555 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss { }; /* Returned bss is reference counted and must be cleaned up appropriately. 
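 * (Callers drop the reference with cfg80211_put_bss() when done.)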
*/ -static struct cfg80211_internal_bss * +struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, - bool signal_valid) + bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; if (WARN_ON(!tmp->pub.channel)) return NULL; - tmp->ts = jiffies; + tmp->ts = ts; spin_lock_bh(&rdev->bss_lock); @@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1972,6 +1974,27 @@ out: } EXPORT_SYMBOL(cfg80211_unlink_bss); +void cfg80211_bss_iter(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + void (*iter)(struct wiphy *wiphy, + struct cfg80211_bss *bss, + void *data), + void *iter_data) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&rdev->bss_lock); + + list_for_each_entry(bss, &rdev->bss_list, list) { + if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) + iter(wiphy, &bss->pub, iter_data); + } + + spin_unlock_bh(&rdev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_bss_iter); + #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7d34cb884840..7a6c38ddc65a 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev, u8 *next; if (params->bss) { - /* Make sure the bss entry provided by the driver is valid. */ struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); - if (WARN_ON(list_empty(&ibss->list))) { - cfg80211_put_bss(wdev->wiphy, params->bss); - return; + if (list_empty(&ibss->list)) { + struct cfg80211_bss *found = NULL, *tmp = params->bss; + + found = cfg80211_get_bss(wdev->wiphy, NULL, + params->bss->bssid, + wdev->ssid, wdev->ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (found) { + /* The same BSS is already updated so use it + * instead, as it has latest info. + */ + params->bss = found; + } else { + /* Update with BSS provided by driver, it will + * be freshly added and ref cnted, we can free + * the old one. + * + * signal_valid can be false, as we are not + * expecting the BSS to be found. 
+ * + * keep the old timestamp to avoid confusion */ + cfg80211_bss_update(rdev, ibss, false, + ibss->ts); + } + + cfg80211_put_bss(wdev->wiphy, tmp); } } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2abfff925aac..4fbb91a511ae 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired, WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_tx_mgmt_expired, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan), + TP_ARGS(wdev, cookie, chan), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT, + WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) +); + TRACE_EVENT(cfg80211_new_sta, TP_PROTO(struct net_device *netdev, const u8 *mac_addr, struct station_info *sinfo),
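The cfg80211_tx_mgmt_expired() helper added above reuses the remain-on-channel expiry netlink path to emit NL80211_CMD_FRAME_WAIT_CANCEL when the wait window of an off-channel management-frame TX ends. A minimal driver-side sketch of a caller follows; the drv_mgmt_tx structure and the workqueue plumbing are invented for illustration, and only cfg80211_tx_mgmt_expired() itself comes from this patch set.

#include <linux/slab.h>
#include <linux/workqueue.h>
#include <net/cfg80211.h>

/* Hypothetical per-TX state a driver could keep for a mgmt-frame
 * transmission that userspace asked to wait on (NL80211_ATTR_DURATION).
 */
struct drv_mgmt_tx {
	struct wireless_dev *wdev;
	struct ieee80211_channel *chan;	/* channel the frame went out on */
	u64 cookie;			/* cookie handed back to userspace */
	struct delayed_work expire_work;	/* scheduled for the wait duration */
};

static void drv_mgmt_tx_expire(struct work_struct *work)
{
	struct drv_mgmt_tx *tx = container_of(to_delayed_work(work),
					      struct drv_mgmt_tx, expire_work);

	/* Notifies userspace with NL80211_CMD_FRAME_WAIT_CANCEL, mirroring
	 * what cfg80211_remain_on_channel_expired() does for ROC.
	 */
	cfg80211_tx_mgmt_expired(tx->wdev, tx->cookie, tx->chan, GFP_KERNEL);
	kfree(tx);
}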