summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan_dev.c1
-rw-r--r--net/Kconfig2
-rw-r--r--net/bridge/br_device.c1
-rw-r--r--net/bridge/br_netfilter_hooks.c247
-rw-r--r--net/bridge/br_netfilter_ipv6.c2
-rw-r--r--net/bridge/br_private.h1
-rw-r--r--net/bridge/netfilter/Kconfig14
-rw-r--r--net/bridge/netfilter/Makefile3
-rw-r--r--net/bridge/netfilter/ebt_dnat.c2
-rw-r--r--net/bridge/netfilter/ebt_redirect.c2
-rw-r--r--net/bridge/netfilter/ebt_snat.c2
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c435
-rw-r--r--net/core/bpf_sk_storage.c12
-rw-r--r--net/core/devlink.c258
-rw-r--r--net/core/ethtool.c24
-rw-r--r--net/core/filter.c89
-rw-r--r--net/core/flow_dissector.c26
-rw-r--r--net/core/flow_offload.c10
-rw-r--r--net/core/hwbm.c15
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/net_namespace.c28
-rw-r--r--net/core/netpoll.c10
-rw-r--r--net/core/page_pool.c95
-rw-r--r--net/core/pktgen.c8
-rw-r--r--net/core/rtnetlink.c9
-rw-r--r--net/core/skbuff.c108
-rw-r--r--net/core/sock.c4
-rw-r--r--net/core/sock_map.c9
-rw-r--r--net/core/sock_reuseport.c24
-rw-r--r--net/core/xdp.c120
-rw-r--r--net/dsa/Kconfig1
-rw-r--r--net/dsa/dsa2.c92
-rw-r--r--net/dsa/dsa_priv.h17
-rw-r--r--net/dsa/port.c166
-rw-r--r--net/dsa/slave.c182
-rw-r--r--net/dsa/tag_8021q.c57
-rw-r--r--net/dsa/tag_sja1105.c213
-rw-r--r--net/ethernet/eth.c14
-rw-r--r--net/ieee802154/6lowpan/reassembly.c51
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/devinet.c160
-rw-r--r--net/ipv4/fib_frontend.c73
-rw-r--r--net/ipv4/fib_lookup.h1
-rw-r--r--net/ipv4/fib_rules.c8
-rw-r--r--net/ipv4/fib_semantics.c364
-rw-r--r--net/ipv4/fib_trie.c163
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/igmp.c5
-rw-r--r--net/ipv4/inet_connection_sock.c5
-rw-r--r--net/ipv4/inet_fragment.c130
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/ip_fragment.c81
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c350
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c2
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c4
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c395
-rw-r--r--net/ipv4/netfilter/iptable_raw.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c2
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c9
-rw-r--r--net/ipv4/nexthop.c1828
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/route.c152
-rw-r--r--net/ipv4/sysctl_net_ipv4.c96
-rw-r--r--net/ipv4/tcp.c52
-rw-r--r--net/ipv4/tcp_fastopen.c201
-rw-r--r--net/ipv4/tcp_input.c2
-rw-r--r--net/ipv4/tcp_ipv4.c24
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/udp.c24
-rw-r--r--net/ipv4/udp_offload.c2
-rw-r--r--net/ipv6/addrconf.c19
-rw-r--r--net/ipv6/addrconf_core.c6
-rw-r--r--net/ipv6/af_inet6.c5
-rw-r--r--net/ipv6/fib6_rules.c12
-rw-r--r--net/ipv6/icmp.c4
-rw-r--r--net/ipv6/inet6_hashtables.c2
-rw-r--r--net/ipv6/ip6_fib.c214
-rw-r--r--net/ipv6/ip6_output.c340
-rw-r--r--net/ipv6/ndisc.c11
-rw-r--r--net/ipv6/netfilter.c129
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c420
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c53
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/reassembly.c52
-rw-r--r--net/ipv6/route.c1433
-rw-r--r--net/ipv6/sysctl_net_ipv6.c3
-rw-r--r--net/ipv6/tcp_ipv6.c29
-rw-r--r--net/ipv6/udp.c26
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/l2tp_debugfs.c21
-rw-r--r--net/l3mdev/l3mdev.c7
-rw-r--r--net/lapb/lapb_iface.c3
-rw-r--r--net/mac80211/cfg.c7
-rw-r--r--net/mac80211/debugfs.c1
-rw-r--r--net/mac80211/debugfs_key.c3
-rw-r--r--net/mac80211/debugfs_netdev.c10
-rw-r--r--net/mac80211/debugfs_sta.c2
-rw-r--r--net/mac80211/key.c100
-rw-r--r--net/mac80211/main.c4
-rw-r--r--net/mac80211/mlme.c25
-rw-r--r--net/mac80211/offchannel.c4
-rw-r--r--net/mac80211/rate.c27
-rw-r--r--net/mac80211/rc80211_minstrel.c4
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c3
-rw-r--r--net/mac80211/sta_info.c43
-rw-r--r--net/netfilter/core.c22
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h3
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c4
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c3
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c5
-rw-r--r--net/netfilter/ipset/ip_set_core.c97
-rw-r--r--net/netfilter/ipset/ip_set_getport.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c4
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c2
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c3
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c72
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c83
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c155
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c9
-rw-r--r--net/netfilter/nf_conntrack_core.c25
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c2
-rw-r--r--net/netfilter/nf_conntrack_proto.c126
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c2
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c4
-rw-r--r--net/netfilter/nf_flow_table_core.c1
-rw-r--r--net/netfilter/nf_nat_helper.c4
-rw-r--r--net/netfilter/nf_nat_proto.c24
-rw-r--r--net/netfilter/nf_nat_redirect.c12
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_synproxy_core.c898
-rw-r--r--net/netfilter/nf_tables_api.c52
-rw-r--r--net/netfilter/nfnetlink_osf.c5
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netfilter/nft_ct.c142
-rw-r--r--net/netfilter/nft_dynset.c2
-rw-r--r--net/netfilter/nft_exthdr.c136
-rw-r--r--net/netfilter/nft_payload.c6
-rw-r--r--net/netfilter/xt_DSCP.c8
-rw-r--r--net/netfilter/xt_HL.c4
-rw-r--r--net/netfilter/xt_TCPMSS.c2
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c28
-rw-r--r--net/netfilter/xt_iprange.c4
-rw-r--r--net/netfilter/xt_owner.c26
-rw-r--r--net/netfilter/xt_set.c45
-rw-r--r--net/netlink/af_netlink.c20
-rw-r--r--net/openvswitch/datapath.c2
-rw-r--r--net/openvswitch/vport.c2
-rw-r--r--net/packet/af_packet.c99
-rw-r--r--net/packet/internal.h1
-rw-r--r--net/rds/ib.c2
-rw-r--r--net/sched/Kconfig25
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_ctinfo.c407
-rw-r--r--net/sched/cls_flower.c17
-rw-r--r--net/sched/cls_fw.c13
-rw-r--r--net/sched/cls_matchall.c9
-rw-r--r--net/sched/cls_u32.c15
-rw-r--r--net/sched/sch_ingress.c2
-rw-r--r--net/sctp/offload.c7
-rw-r--r--net/sctp/protocol.c2
-rw-r--r--net/smc/af_smc.c73
-rw-r--r--net/smc/smc_clc.c11
-rw-r--r--net/socket.c2
-rw-r--r--net/strparser/strparser.c8
-rw-r--r--net/tipc/bcast.c4
-rw-r--r--net/tipc/link.c115
-rw-r--r--net/tipc/msg.h4
-rw-r--r--net/tipc/netlink.c2
-rw-r--r--net/tipc/netlink_compat.c10
-rw-r--r--net/tipc/node.c2
-rw-r--r--net/tls/tls_device.c168
-rw-r--r--net/tls/tls_device_fallback.c12
-rw-r--r--net/tls/tls_sw.c26
-rw-r--r--net/unix/diag.c12
-rw-r--r--net/vmw_vsock/af_vsock.c38
-rw-r--r--net/vmw_vsock/hyperv_transport.c93
-rw-r--r--net/wireless/core.c13
-rw-r--r--net/wireless/core.h4
-rw-r--r--net/wireless/nl80211.c77
-rw-r--r--net/wireless/scan.c33
-rw-r--r--net/wireless/sme.c32
-rw-r--r--net/wireless/trace.h18
203 files changed, 9752 insertions, 3302 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a0b2d8b9def7..93eadf179123 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -580,6 +580,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
dev->hw_enc_features = vlan_tnl_features(real_dev);
+ dev->mpls_features = real_dev->mpls_features;
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
diff --git a/net/Kconfig b/net/Kconfig
index d122f53c6fa2..57f51a279ad6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -67,8 +67,6 @@ source "net/xdp/Kconfig"
config INET
bool "TCP/IP networking"
- select CRYPTO
- select CRYPTO_AES
---help---
These are the protocols used on the Internet and on most local
Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c05def8fd9cd..681b72862c16 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -52,6 +52,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_switchdev_frame_unmark(skb);
BR_INPUT_SKB_CB(skb)->brdev = dev;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
skb_reset_mac_header(skb);
eth = eth_hdr(skb);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 34fa72c72ad8..d3f9592f4ff8 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -47,25 +47,22 @@ static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
bool enabled;
-};
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
+ struct ctl_table_header *ctl_hdr;
#endif
+ /* default value is 1 */
+ int call_iptables;
+ int call_ip6tables;
+ int call_arptables;
+
+ /* default value is 0 */
+ int filter_vlan_tagged;
+ int filter_pppoe_tagged;
+ int pass_vlan_indev;
+};
+
#define IS_IP(skb) \
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
@@ -85,17 +82,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
return 0;
}
-#define IS_VLAN_IP(skb) \
- (vlan_proto(skb) == htons(ETH_P_IP) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
-#define IS_VLAN_IPV6(skb) \
- (vlan_proto(skb) == htons(ETH_P_IPV6) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
-#define IS_VLAN_ARP(skb) \
- (vlan_proto(skb) == htons(ETH_P_ARP) && \
- brnf_filter_vlan_tagged)
+ return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+ brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
@@ -103,15 +111,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
sizeof(struct pppoe_hdr)));
}
-#define IS_PPPOE_IP(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IP) && \
- brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
-#define IS_PPPOE_IPV6(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IPV6) && \
- brnf_filter_pppoe_tagged)
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IPV6) &&
+ brnet->filter_pppoe_tagged;
+}
/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -408,12 +424,16 @@ bridged_dnat:
return 0;
}
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct net *net)
{
struct net_device *vlan, *br;
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
br = bridge_parent(dev);
- if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+ if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
return br;
vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -423,7 +443,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
}
/* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
@@ -434,7 +454,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
nf_bridge->in_prerouting = 1;
nf_bridge->physindev = skb->dev;
- skb->dev = brnf_get_logical_dev(skb, skb->dev);
+ skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
if (skb->protocol == htons(ETH_P_8021Q))
nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -460,6 +480,7 @@ static unsigned int br_nf_pre_routing(void *priv,
struct net_bridge_port *p;
struct net_bridge *br;
__u32 len = nf_bridge_encap_header_len(skb);
+ struct brnf_net *brnet;
if (unlikely(!pskb_may_pull(skb, len)))
return NF_DROP;
@@ -469,8 +490,10 @@ static unsigned int br_nf_pre_routing(void *priv,
return NF_DROP;
br = p->br;
- if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
- if (!brnf_call_ip6tables &&
+ brnet = net_generic(state->net, brnf_net_id);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net)) {
+ if (!brnet->call_ip6tables &&
!br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
@@ -478,10 +501,11 @@ static unsigned int br_nf_pre_routing(void *priv,
return br_nf_pre_routing_ipv6(priv, skb, state);
}
- if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+ if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
return NF_ACCEPT;
- if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+ if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+ !is_pppoe_ip(skb, state->net))
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
@@ -491,7 +515,7 @@ static unsigned int br_nf_pre_routing(void *priv,
if (!nf_bridge_alloc(skb))
return NF_DROP;
- if (!setup_pre_routing(skb))
+ if (!setup_pre_routing(skb, state->net))
return NF_DROP;
nf_bridge = nf_bridge_info_get(skb);
@@ -514,7 +538,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct net_device *in;
- if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+ if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
if (skb->protocol == htons(ETH_P_IP))
nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -569,9 +593,11 @@ static unsigned int br_nf_forward_ip(void *priv,
if (!parent)
return NF_DROP;
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
pf = NFPROTO_IPV6;
else
return NF_ACCEPT;
@@ -602,7 +628,7 @@ static unsigned int br_nf_forward_ip(void *priv,
skb->protocol = htons(ETH_P_IPV6);
NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
- brnf_get_logical_dev(skb, state->in),
+ brnf_get_logical_dev(skb, state->in, state->net),
parent, br_nf_forward_finish);
return NF_STOLEN;
@@ -615,23 +641,25 @@ static unsigned int br_nf_forward_arp(void *priv,
struct net_bridge_port *p;
struct net_bridge *br;
struct net_device **d = (struct net_device **)(skb->cb);
+ struct brnf_net *brnet;
p = br_port_get_rcu(state->out);
if (p == NULL)
return NF_ACCEPT;
br = p->br;
- if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+ brnet = net_generic(state->net, brnf_net_id);
+ if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
return NF_ACCEPT;
if (!IS_ARP(skb)) {
- if (!IS_VLAN_ARP(skb))
+ if (!is_vlan_arp(skb, state->net))
return NF_ACCEPT;
nf_bridge_pull_encap_header(skb);
}
if (arp_hdr(skb)->ar_pln != 4) {
- if (IS_VLAN_ARP(skb))
+ if (is_vlan_arp(skb, state->net))
nf_bridge_push_encap_header(skb);
return NF_ACCEPT;
}
@@ -791,9 +819,11 @@ static unsigned int br_nf_post_routing(void *priv,
if (!realoutdev)
return NF_DROP;
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
pf = NFPROTO_IPV6;
else
return NF_ACCEPT;
@@ -946,23 +976,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event,
return NOTIFY_OK;
}
-static void __net_exit brnf_exit_net(struct net *net)
-{
- struct brnf_net *brnet = net_generic(net, brnf_net_id);
-
- if (!brnet->enabled)
- return;
-
- nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
- brnet->enabled = false;
-}
-
-static struct pernet_operations brnf_net_ops __read_mostly = {
- .exit = brnf_exit_net,
- .id = &brnf_net_id,
- .size = sizeof(struct brnf_net),
-};
-
static struct notifier_block brnf_notifier __read_mostly = {
.notifier_call = brnf_device_event,
};
@@ -1021,49 +1034,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
static struct ctl_table brnf_table[] = {
{
.procname = "bridge-nf-call-arptables",
- .data = &brnf_call_arptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-iptables",
- .data = &brnf_call_iptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-ip6tables",
- .data = &brnf_call_ip6tables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-vlan-tagged",
- .data = &brnf_filter_vlan_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-pppoe-tagged",
- .data = &brnf_filter_pppoe_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-pass-vlan-input-dev",
- .data = &brnf_pass_vlan_indev,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{ }
};
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+ brnf->call_iptables = 1;
+ brnf->call_ip6tables = 1;
+ brnf->call_arptables = 1;
+ brnf->filter_vlan_tagged = 0;
+ brnf->filter_pppoe_tagged = 0;
+ brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table = brnf_table;
+ struct brnf_net *brnet;
+
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ brnet = net_generic(net, brnf_net_id);
+ table[0].data = &brnet->call_arptables;
+ table[1].data = &brnet->call_iptables;
+ table[2].data = &brnet->call_ip6tables;
+ table[3].data = &brnet->filter_vlan_tagged;
+ table[4].data = &brnet->filter_pppoe_tagged;
+ table[5].data = &brnet->pass_vlan_indev;
+
+ br_netfilter_sysctl_default(brnet);
+
+ brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table);
+ if (!brnet->ctl_hdr) {
+ if (!net_eq(net, &init_net))
+ kfree(table);
+
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+ struct brnf_net *brnet)
+{
+ struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(brnet->ctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+ return br_netfilter_sysctl_init_net(net);
+}
+#endif
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet;
+
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled) {
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+ }
+
+#ifdef CONFIG_SYSCTL
+ br_netfilter_sysctl_exit_net(net, brnet);
#endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+ .init = brnf_init_net,
+#endif
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
static int __init br_netfilter_init(void)
{
@@ -1079,16 +1167,6 @@ static int __init br_netfilter_init(void)
return ret;
}
-#ifdef CONFIG_SYSCTL
- brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
- if (brnf_sysctl_header == NULL) {
- printk(KERN_WARNING
- "br_netfilter: can't register to sysctl.\n");
- unregister_netdevice_notifier(&brnf_notifier);
- unregister_pernet_subsys(&brnf_net_ops);
- return -ENOMEM;
- }
-#endif
RCU_INIT_POINTER(nf_br_ops, &br_ops);
printk(KERN_NOTICE "Bridge firewalling registered\n");
return 0;
@@ -1099,9 +1177,6 @@ static void __exit br_netfilter_fini(void)
RCU_INIT_POINTER(nf_br_ops, NULL);
unregister_netdevice_notifier(&brnf_notifier);
unregister_pernet_subsys(&brnf_net_ops);
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(brnf_sysctl_header);
-#endif
}
module_init(br_netfilter_init);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 0e63e5dc5ac4..e4e0c836c3f5 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -224,7 +224,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
nf_bridge = nf_bridge_alloc(skb);
if (!nf_bridge)
return NF_DROP;
- if (!setup_pre_routing(skb))
+ if (!setup_pre_routing(skb, state->net))
return NF_DROP;
nf_bridge = nf_bridge_info_get(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 159a0e2cb0f6..e8cf03b43b7d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -421,6 +421,7 @@ struct net_bridge {
struct br_input_skb_cb {
struct net_device *brdev;
+ u16 frag_max_size;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
u8 igmp;
u8 mrouters_only:1;
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c3ad90c43801..f4fb0b9b927d 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -19,6 +19,20 @@ config NF_LOG_BRIDGE
tristate "Bridge packet logging"
select NF_LOG_COMMON
+config NF_CONNTRACK_BRIDGE
+ tristate "IPv4/IPV6 bridge connection tracking support"
+ depends on NF_CONNTRACK
+ default n
+ help
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections. This is used to enhance packet filtering via
+ stateful policies. Enable this if you want native tracking from
+ the bridge. This provides a replacement for the `br_netfilter'
+ infrastructure.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
endif # NF_TABLES_BRIDGE
menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9b868861f21a..9d7767322a64 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -5,6 +5,9 @@
obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
+
# packet logging
obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index eeae23a73c6a..ed91ea31978a 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct ebt_nat_info *info = par->targinfo;
struct net_device *dev;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 53ef08e6765f..0cad62a4052b 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_redirect_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN))
return EBT_DROP;
if (xt_hooknum(par) != NF_BR_BROUTING)
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 700d338d5ddb..27443bf229a3 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN * 2))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..4f5444d2a526
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ unsigned int hlen, ll_rs, mtu;
+ struct ip_frag_state state;
+ struct iphdr *iph;
+ int err;
+
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * Setup starting values
+ */
+
+ hlen = iph->ihl * 4;
+ frag_max_size -= hlen;
+ ll_rs = LL_RESERVED_SPACE(skb->dev);
+ mtu = skb->dev->mtu;
+
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip_fraglist_iter iter;
+ struct sk_buff *frag;
+
+ if (first_len - hlen > mtu ||
+ skb_headroom(skb) < ll_rs)
+ goto blackhole;
+
+ if (skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ if (frag->len > mtu ||
+ skb_headroom(frag) < hlen + ll_rs)
+ goto blackhole;
+
+ if (skb_shared(frag))
+ goto slow_path;
+ }
+
+ ip_fraglist_init(skb, iph, hlen, &iter);
+
+ for (;;) {
+ if (iter.frag)
+ ip_fraglist_prepare(skb, &iter);
+
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip_fraglist_next(&iter);
+ }
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* ip_defrag() expects IPCB() in place. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+ size_t inet_skb_parm_size)
+{
+ memcpy(cb, skb->cb, sizeof(*cb));
+ memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+ const struct br_input_skb_cb *cb,
+ u16 fragsz)
+{
+ memcpy(skb->cb, cb, sizeof(*cb));
+ BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ if (!ip_is_fragment(ip_hdr(skb)))
+ return NF_ACCEPT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ local_bh_enable();
+ if (!err) {
+ br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+ skb->ignore_df = 1;
+ return NF_ACCEPT;
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+ err = nf_ipv6_br_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ /* queued */
+ if (err == -EINPROGRESS)
+ return NF_STOLEN;
+
+ br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+ return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 ||
+ iph->version != 4)
+ return -1;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < nhoff + len ||
+ len < (iph->ihl * 4))
+ return -1;
+
+ return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return -1;
+
+ len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+ if (skb->len < len)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state bridge_state = *state;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u32 len;
+ int ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) ||
+ ctinfo == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return NF_ACCEPT;
+
+ len = ntohs(ip_hdr(skb)->tot_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ip_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV4;
+ ret = nf_ct_br_defrag4(skb, &bridge_state);
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return NF_ACCEPT;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ipv6_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV6;
+ ret = nf_ct_br_defrag6(skb, &bridge_state);
+ break;
+ default:
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_conntrack_in(skb, &bridge_state);
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data)
+{
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_present = true;
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_present = false;
+ }
+ skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ struct nf_ct_bridge_frag_data data;
+
+ if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+ return NF_ACCEPT;
+
+ nf_ct_bridge_frag_save(skb, &data);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return NF_STOLEN;
+}
+
+/* Actually only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+ const struct nf_ct_bridge_frag_data *data)
+{
+ int err;
+
+ err = skb_cow_head(skb, ETH_HLEN);
+ if (err) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (data->vlan_present)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+ else if (skb_vlan_tag_present(skb))
+ __vlan_hwaccel_clear_tag(skb);
+
+ skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *skb)
+{
+ int err;
+
+ err = nf_ct_bridge_frag_restore(skb, data);
+ if (err < 0)
+ return err;
+
+ return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int protoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return nf_conntrack_confirm(skb);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+ case htons(ETH_P_IPV6): {
+ unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return nf_conntrack_confirm(skb);
+ }
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+ return nf_confirm(skb, protoff, ct, ctinfo);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int ret;
+
+ ret = nf_ct_bridge_confirm(skb);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+ {
+ .hook = nf_ct_bridge_pre,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = nf_ct_bridge_post,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+ .ops = nf_ct_bridge_hook_ops,
+ .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops),
+ .me = THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+ nf_ct_bridge_register(&bridge_info);
+
+ return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+ nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d1c4e1f3be2c..94c7f77ecb6b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
unsigned int i;
u32 nbuckets;
u64 cost;
+ int ret;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
if (!smap)
@@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
nbuckets = 1U << smap->bucket_log;
+ cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+ ret = bpf_map_charge_init(&smap->map.memory, cost);
+ if (ret < 0) {
+ kfree(smap);
+ return ERR_PTR(ret);
+ }
+
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
+ bpf_map_charge_finish(&smap->map.memory);
kfree(smap);
return ERR_PTR(-ENOMEM);
}
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
@@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
BPF_SK_STORAGE_CACHE_SIZE;
- smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
return &smap->map;
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 132f4b757963..4baf716e535e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -17,6 +17,7 @@
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
+#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -2668,6 +2669,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
}
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done, unsigned long total)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+ goto out;
+
+ if (status_msg &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+ status_msg))
+ goto nla_put_failure;
+ if (component &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+ component))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ done, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+ component, done, total);
+ if (err)
+ goto out_free_msg;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
+
+void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_END,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ status_msg, component, done, total);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
@@ -4415,6 +4518,35 @@ nla_put_failure:
return err;
}
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum devlink_command cmd)
+{
+ int index = cb->args[0];
+ int tmp_index = index;
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if ((err && err != -EMSGSIZE) || tmp_index == index)
+ goto nla_put_failure;
+
+ cb->args[0] = index;
+ genlmsg_end(skb, hdr);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return err;
+}
+
struct devlink_health_reporter {
struct list_head list;
void *priv;
@@ -4647,17 +4779,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
EXPORT_SYMBOL_GPL(devlink_health_report);
static struct devlink_health_reporter *
-devlink_health_reporter_get_from_info(struct devlink *devlink,
- struct genl_info *info)
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
{
struct devlink_health_reporter *reporter;
char *reporter_name;
- if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+ if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
return NULL;
- reporter_name =
- nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+ reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
mutex_lock(&devlink->reporters_lock);
reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
if (reporter)
@@ -4666,6 +4797,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink,
return reporter;
}
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
+{
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink;
+ struct nlattr **attrs;
+ int err;
+
+ attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ err = nlmsg_parse_deprecated(cb->nlh,
+ GENL_HDRLEN + devlink_nl_family.hdrsize,
+ attrs, DEVLINK_ATTR_MAX,
+ devlink_nl_family.policy, cb->extack);
+ if (err)
+ goto free;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+ if (IS_ERR(devlink))
+ goto unlock;
+
+ reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+ mutex_unlock(&devlink_mutex);
+ kfree(attrs);
+ return reporter;
+unlock:
+ mutex_unlock(&devlink_mutex);
+free:
+ kfree(attrs);
+ return NULL;
+}
+
static void
devlink_health_reporter_put(struct devlink_health_reporter *reporter)
{
@@ -4901,32 +5074,40 @@ out:
return err;
}
-static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int
+devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
+ u64 start = cb->args[0];
int err;
- reporter = devlink_health_reporter_get_from_info(devlink, info);
+ reporter = devlink_health_reporter_get_from_cb(cb);
if (!reporter)
return -EINVAL;
if (!reporter->ops->dump) {
- devlink_health_reporter_put(reporter);
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto out;
}
-
mutex_lock(&reporter->dump_lock);
- err = devlink_health_do_dump(reporter, NULL);
- if (err)
- goto out;
-
- err = devlink_fmsg_snd(reporter->dump_fmsg, info,
- DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0);
+ if (!start) {
+ err = devlink_health_do_dump(reporter, NULL);
+ if (err)
+ goto unlock;
+ cb->args[1] = reporter->dump_ts;
+ }
+ if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) {
+ NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry");
+ err = -EAGAIN;
+ goto unlock;
+ }
-out:
+ err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+ DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
mutex_unlock(&reporter->dump_lock);
+out:
devlink_health_reporter_put(reporter);
return err;
}
@@ -5263,7 +5444,7 @@ static const struct genl_ops devlink_nl_ops[] = {
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+ .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
DEVLINK_NL_FLAG_NO_LOCK,
@@ -5386,6 +5567,38 @@ void devlink_free(struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_free);
+static void devlink_port_type_warn(struct work_struct *work)
+{
+ WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+ /* Ignore CPU and DSA flavours. */
+ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ /* Schedule a work to WARN in case driver does not set port
+ * type within timeout.
+ */
+ schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
/**
* devlink_port_register - Register devlink port
*
@@ -5415,6 +5628,8 @@ int devlink_port_register(struct devlink *devlink,
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
mutex_unlock(&devlink->lock);
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
}
@@ -5429,6 +5644,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
+ devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
@@ -5442,6 +5658,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
{
if (WARN_ON(!devlink_port->registered))
return;
+ devlink_port_type_warn_cancel(devlink_port);
spin_lock(&devlink_port->type_lock);
devlink_port->type = type;
devlink_port->type_dev = type_dev;
@@ -5515,6 +5732,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+ devlink_port_type_warn_schedule(devlink_port);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d1011b2e24f..6288e69e94fc 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
match->mask.basic.n_proto = htons(0xffff);
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+ case ETHER_FLOW: {
+ const struct ethhdr *ether_spec, *ether_m_spec;
+
+ ether_spec = &fs->h_u.ether_spec;
+ ether_m_spec = &fs->m_u.ether_spec;
+
+ if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+ ether_addr_copy(match->key.eth_addrs.src,
+ ether_spec->h_source);
+ ether_addr_copy(match->mask.eth_addrs.src,
+ ether_m_spec->h_source);
+ }
+ if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+ ether_addr_copy(match->key.eth_addrs.dst,
+ ether_spec->h_dest);
+ ether_addr_copy(match->mask.eth_addrs.dst,
+ ether_m_spec->h_dest);
+ }
+ if (ether_m_spec->h_proto) {
+ match->key.basic.n_proto = ether_spec->h_proto;
+ match->mask.basic.n_proto = ether_m_spec->h_proto;
+ }
+ }
+ break;
case TCP_V4_FLOW:
case UDP_V4_FLOW: {
const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
diff --git a/net/core/filter.c b/net/core/filter.c
index f615e42cf4ef..2014d76e0d2a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -62,6 +62,7 @@
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
@@ -4670,7 +4671,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (res.type != RTN_UNICAST)
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (res.fi->fib_nhs > 1)
+ if (fib_info_num_path(res.fi) > 1)
fib_select_path(net, &res, &fl4, NULL);
if (check_mtu) {
@@ -5694,6 +5695,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
return INET_ECN_set_ce(skb);
}
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \
+ FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct xdp_sock, FIELD)); \
+ } while (0)
+
+ switch (si->off) {
+ case offsetof(struct bpf_xdp_sock, queue_id):
+ BPF_XDP_SOCK_GET(queue_id);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
.func = bpf_skb_ecn_set_ce,
.gpl_only = false,
@@ -5896,6 +5937,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skc_lookup_tcp:
return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5933,6 +5978,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
@@ -6113,6 +6162,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_sockopt_event_output_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
default:
return bpf_base_func_proto(func_id);
}
@@ -6800,6 +6857,13 @@ static bool sock_addr_is_valid_access(int off, int size,
if (size != size_default)
return false;
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ if (type != BPF_READ)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
default:
if (type == BPF_READ) {
if (size != size_default)
@@ -6843,6 +6907,11 @@ static bool sock_ops_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case offsetof(struct bpf_sock_ops, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ break;
default:
if (size != size_default)
return false;
@@ -7750,6 +7819,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_addr_kern, sk));
+ break;
}
return insn - insn_buf;
@@ -8009,6 +8083,19 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+ case offsetof(struct bpf_sock_ops, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ break;
}
return insn - insn_buf;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index edd622956083..01ad60b5aa75 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -199,6 +199,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
}
EXPORT_SYMBOL(__skb_flow_get_ports);
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_meta *meta;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+ return;
+
+ meta = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_META,
+ target_container);
+ meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
static void
skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
struct flow_dissector *flow_dissector,
@@ -757,7 +773,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
* @flags: flags that control the dissection process, e.g.
- * FLOW_DISSECTOR_F_STOP_AT_L3.
+ * FLOW_DISSECTOR_F_STOP_AT_ENCAP.
*
* The function will try to retrieve individual keys into target specified
* by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -922,11 +938,6 @@ proto_again:
__skb_flow_dissect_ipv4(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
- break;
- }
-
break;
}
case htons(ETH_P_IPV6): {
@@ -975,9 +986,6 @@ proto_again:
__skb_flow_dissect_ipv6(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
break;
}
case htons(ETH_P_8021AD):
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 5ce7d47a960e..f52fe0bc4017 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
- rule = kzalloc(sizeof(struct flow_rule) +
- sizeof(struct flow_action_entry) * num_actions,
+ rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
if (!rule)
return NULL;
@@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc);
(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \
(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \
+void flow_rule_match_meta(const struct flow_rule *rule,
+ struct flow_match_meta *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
void flow_rule_match_basic(const struct flow_rule *rule,
struct flow_match_basic *out)
{
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index fd822ca5a245..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(hwbm_pool_refill);
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
{
int err, i;
- unsigned long flags;
- spin_lock_irqsave(&bm_pool->lock, flags);
+ mutex_lock(&bm_pool->buf_lock);
if (bm_pool->buf_num == bm_pool->size) {
pr_warn("pool already filled\n");
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return bm_pool->buf_num;
}
if (buf_num + bm_pool->buf_num > bm_pool->size) {
pr_warn("cannot allocate %d buffers for pool\n",
buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
buf_num, bm_pool->buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
for (i = 0; i < buf_num; i++) {
- err = hwbm_pool_refill(bm_pool, gfp);
+ err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
if (err < 0)
break;
}
@@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
bm_pool->buf_num += i;
pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return i;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9e7fc929bc50..742cea4ce72e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
int error;
struct neigh_hash_table *nht;
+ trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
+
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
#endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
#include <trace/events/neigh.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 15f68842ac6b..198ce503ae73 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -145,6 +145,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -330,6 +341,12 @@ out_undo:
list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
+ synchronize_rcu();
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
ops = saved_ops;
@@ -541,10 +558,15 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
+ /* Run all of the network namespace pre_exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
*/
synchronize_rcu();
@@ -1101,6 +1123,8 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
@@ -1115,6 +1139,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
@@ -1139,6 +1165,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
if (!np->local_ip.ip) {
if (!np->ipv6) {
+ const struct in_ifaddr *ifa;
+
in_dev = __in_dev_get_rtnl(ndev);
+ if (!in_dev)
+ goto put_noaddr;
- if (!in_dev || !in_dev->ifa_list) {
+ ifa = rtnl_dereference(in_dev->ifa_list);
+ if (!ifa) {
+put_noaddr:
np_err(np, "no IP address for %s, aborting\n",
np->dev_name);
err = -EDESTADDRREQ;
goto put;
}
- np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np->local_ip.ip = ifa->ifa_local;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..b366f59885c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,9 +4,11 @@
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/device.h>
#include <net/page_pool.h>
#include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include <trace/events/page_pool.h>
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -43,6 +47,11 @@ static int page_pool_init(struct page_pool *pool,
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ get_device(pool->p.dev);
+
return 0;
}
@@ -151,6 +160,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
page->dma_addr = dma;
skip_dma_map:
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+
+ trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
/* When page just alloc'ed is should/must have refcnt 1. */
return page;
}
@@ -173,6 +187,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
}
EXPORT_SYMBOL(page_pool_alloc_pages);
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
+
+ distance = _distance(hold_cnt, release_cnt);
+
+ trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+ return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+ s32 inflight = page_pool_inflight(pool);
+
+ /* The distance should not be able to become negative */
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+ return (inflight == 0);
+}
+
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
@@ -180,7 +221,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
dma_addr_t dma;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- return;
+ goto skip_dma_unmap;
dma = page->dma_addr;
/* DMA unmap */
@@ -188,12 +229,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
+skip_dma_unmap:
+ atomic_inc(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page,
+ atomic_read(&pool->pages_state_release_cnt));
}
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+ /* When page is unmapped, this implies page will not be
+ * returned to page_pool.
+ */
+ __page_pool_clean_page(pool, page);
+}
+EXPORT_SYMBOL(page_pool_unmap_page);
+
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
__page_pool_clean_page(pool, page);
+
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +341,41 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
{
- struct page_pool *pool;
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
- pool = container_of(rcu, struct page_pool, rcu);
+ distance = _distance(hold_cnt, release_cnt);
+ /* Drivers should fix this, but only problematic when DMA is used */
+ WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+ distance, hold_cnt, release_cnt);
+}
+
+void __page_pool_free(struct page_pool *pool)
+{
WARN(pool->alloc.count, "API usage violation");
+ WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+ /* Can happen due to forced shutdown */
+ if (!__page_pool_safe_to_destroy(pool))
+ __warn_in_flight(pool);
- __page_pool_empty_ring(pool);
ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
kfree(pool);
}
+EXPORT_SYMBOL(__page_pool_free);
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
{
struct page *page;
@@ -317,7 +393,6 @@ void page_pool_destroy(struct page_pool *pool)
*/
__page_pool_empty_ring(pool);
- /* An xdp_mem_allocator can still ref page_pool pointer */
- call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+ return __page_pool_safe_to_destroy(pool);
}
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f975c5e2a369..bb9915291644 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2118,9 +2118,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
- if (in_dev->ifa_list) {
- pkt_dev->saddr_min =
- in_dev->ifa_list->ifa_address;
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(in_dev->ifa_list);
+ if (ifa) {
+ pkt_dev->saddr_min = ifa->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cec60583931f..1ee6460f8275 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -751,6 +751,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
struct nlattr *mx;
int i, valid = 0;
+ /* nothing is dumped for dst_default_metrics, so just skip the loop */
+ if (metrics == dst_default_metrics.metrics)
+ return 0;
+
mx = nla_nest_start_noflag(skb, RTA_METRICS);
if (mx == NULL)
return -ENOBUFS;
@@ -908,6 +912,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(0) +
nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_broadcast)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
nla_total_size(MAX_VLAN_LIST_LEN *
@@ -1197,6 +1202,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
struct ifla_vf_mac vf_mac;
+ struct ifla_vf_broadcast vf_broadcast;
struct ifla_vf_info ivi;
memset(&ivi, 0, sizeof(ivi));
@@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_trust.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
vf_vlan_info.vlan = ivi.vlan;
@@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
if (!vf)
goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
&vf_rate) ||
@@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
[IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c8cd99c3603f..5323441a12cc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,6 +72,7 @@
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
+#include <linux/indirect_call_wrapper.h>
#include "datagram.h"
@@ -365,19 +366,21 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc;
- unsigned long flags;
- void *data;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- local_irq_save(flags);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, gfp_mask);
- local_irq_restore(flags);
- return data;
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
/**
* netdev_alloc_frag - allocate a page fragment
* @fragsz: fragment size
@@ -387,26 +390,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
*/
void *netdev_alloc_frag(unsigned int fragsz)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
-}
-EXPORT_SYMBOL(netdev_alloc_frag);
-
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
-}
+ struct page_frag_cache *nc;
+ void *data;
-void *napi_alloc_frag(unsigned int fragsz)
-{
fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
+ } else {
+ local_bh_disable();
+ data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ local_bh_enable();
+ }
+ return data;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+EXPORT_SYMBOL(netdev_alloc_frag);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -425,7 +423,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
gfp_t gfp_mask)
{
struct page_frag_cache *nc;
- unsigned long flags;
struct sk_buff *skb;
bool pfmemalloc;
void *data;
@@ -446,13 +443,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- local_irq_save(flags);
-
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
-
- local_irq_restore(flags);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ } else {
+ local_bh_disable();
+ nc = this_cpu_ptr(&napi_alloc_cache.page);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ local_bh_enable();
+ }
if (unlikely(!data))
return NULL;
@@ -909,6 +910,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
}
/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+ struct sk_buff *n;
+
+ n = alloc_skb(0, GFP_ATOMIC);
+ if (!n)
+ return NULL;
+
+ n->len = first->len;
+ n->data_len = first->len;
+ n->truesize = first->truesize;
+
+ skb_shinfo(n)->frag_list = first;
+
+ __copy_skb_header(n, first);
+ n->destructor = NULL;
+
+ return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
+/**
* skb_morph - morph one skb into another
* @dst: the skb to receive the contents
* @src: the skb to supply the contents
@@ -2508,7 +2534,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = ops->update(skb->data + offset, copy, csum);
+ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
+ skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2535,9 +2562,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
frag->page_offset + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = ops->update(vaddr + p_off, p_len, 0);
+ csum2 = INDIRECT_CALL_1(ops->update,
+ csum_partial_ext,
+ vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = ops->combine(csum, csum2, pos, p_len);
+ csum = INDIRECT_CALL_1(ops->combine,
+ csum_block_add_ext, csum,
+ csum2, pos, p_len);
pos += p_len;
}
@@ -2560,7 +2591,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
copy = len;
csum2 = __skb_checksum(frag_iter, offset - start,
copy, 0, ops);
- csum = ops->combine(csum, csum2, pos, copy);
+ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
+ csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
diff --git a/net/core/sock.c b/net/core/sock.c
index aa4a00d381e3..0eb21384079d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1039,6 +1039,10 @@ set_rcvbuf:
}
break;
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..52d4faeee18b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* Make sure page count doesn't overflow. */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- if (cost >= U32_MAX - PAGE_SIZE) {
- err = -EINVAL;
- goto free_stab;
- }
-
- stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- err = bpf_map_precharge_memlock(stab->map.pages);
+ err = bpf_map_charge_init(&stab->map.memory, cost);
if (err)
goto free_stab;
@@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (stab->sks)
return &stab->map;
err = -ENOMEM;
+ bpf_map_charge_finish(&stab->map.memory);
free_stab:
kfree(stab);
return ERR_PTR(err);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index dc4aefdf2a08..9408f9264d05 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+
+ old_prog = NULL;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ rcu_swap_protected(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
+ spin_unlock_bh(&reuseport_lock);
+
+ if (!old_prog)
+ return -ENOENT;
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8aab08b131d9..b29d7b513a18 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,6 +14,8 @@
#include <net/page_pool.h>
#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
-struct xdp_mem_allocator {
- struct xdp_mem_info mem;
- union {
- void *allocator;
- struct page_pool *page_pool;
- struct zero_copy_allocator *zc_alloc;
- };
- struct rhash_head node;
- struct rcu_head rcu;
-};
-
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
@@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+ /* Allocator have indicated safe to remove before this is called */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ page_pool_free(xa->page_pool);
+
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Notice, driver is expected to free the *allocator,
- * e.g. page_pool, and MUST also use RCU free.
- */
-
/* Poison memory */
xa->mem.id = 0xFFFF;
xa->mem.type = 0xF0F0;
@@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
+bool __mem_id_disconnect(int id, bool force)
+{
+ struct xdp_mem_allocator *xa;
+ bool safe_to_remove = true;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+ return true;
+ }
+ xa->disconnect_cnt++;
+
+ /* Detects in-flight packet-pages for page_pool */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+ trace_mem_disconnect(xa, safe_to_remove, force);
+
+ if ((safe_to_remove || force) &&
+ !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+ return (safe_to_remove|force);
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (30 * HZ)
+#define DEFER_MAX_RETRIES 120
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+ bool force = false;
+
+ if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
+ force = true;
+
+ if (__mem_id_disconnect(xa->mem.id, force))
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, xa->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+
+ pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
+ __func__, xa->mem.id, xa->disconnect_cnt, sec);
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
+ if (__mem_id_disconnect(id, false))
+ return;
+
+ /* Could not disconnect, defer new disconnect attempt to later */
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+ xa->defer_start = jiffies;
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
mutex_unlock(&mem_id_lock);
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -301,12 +364,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
+ ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
+ xdp_rxq->mem.id = 0;
errno = PTR_ERR(ptr);
goto err;
}
mutex_unlock(&mem_id_lock);
+ trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
err:
mutex_unlock(&mem_id_lock);
@@ -333,10 +399,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (xa) {
+ if (likely(xa)) {
napi_direct &= !xdp_return_frame_no_direct();
page_pool_put_page(xa->page_pool, page, napi_direct);
} else {
+ /* Hopefully stack show who to blame for late return */
+ WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+ trace_mem_return_failed(mem, page);
put_page(page);
}
rcu_read_unlock();
@@ -379,6 +448,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_release_page(xa->page_pool, page);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
int xdp_attachment_query(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index d449f78c1bd0..6e942dda1bcd 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -106,6 +106,7 @@ config NET_DSA_TAG_LAN9303
config NET_DSA_TAG_SJA1105
tristate "Tag driver for NXP SJA1105 switches"
select NET_DSA_TAG_8021Q
+ select PACKING
help
Say Y or M if you want to enable support for tagging frames with the
NXP SJA1105 switch family. Both the native tagging protocol (which
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 820dd8da57fc..3abd173ebacb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -257,7 +257,7 @@ static int dsa_port_setup(struct dsa_port *dp)
enum devlink_port_flavour flavour;
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
- int err;
+ int err = 0;
if (dp->type == DSA_PORT_TYPE_UNUSED)
return 0;
@@ -295,19 +295,15 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
case DSA_PORT_TYPE_CPU:
err = dsa_port_link_register_of(dp);
- if (err) {
+ if (err)
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
ds->index, dp->index);
- return err;
- }
break;
case DSA_PORT_TYPE_DSA:
err = dsa_port_link_register_of(dp);
- if (err) {
+ if (err)
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
ds->index, dp->index);
- return err;
- }
break;
case DSA_PORT_TYPE_USER:
err = dsa_slave_create(dp);
@@ -319,7 +315,10 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
}
- return 0;
+ if (err)
+ devlink_port_unregister(&dp->devlink_port);
+
+ return err;
}
static void dsa_port_teardown(struct dsa_port *dp)
@@ -347,7 +346,7 @@ static void dsa_port_teardown(struct dsa_port *dp)
static int dsa_switch_setup(struct dsa_switch *ds)
{
- int err;
+ int err = 0;
/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
* driver and before ops->setup() has run, since the switch drivers and
@@ -365,29 +364,41 @@ static int dsa_switch_setup(struct dsa_switch *ds)
err = devlink_register(ds->devlink, ds->dev);
if (err)
- return err;
+ goto free_devlink;
err = dsa_switch_register_notifier(ds);
if (err)
- return err;
+ goto unregister_devlink;
err = ds->ops->setup(ds);
if (err < 0)
- return err;
+ goto unregister_notifier;
if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
- if (!ds->slave_mii_bus)
- return -ENOMEM;
+ if (!ds->slave_mii_bus) {
+ err = -ENOMEM;
+ goto unregister_notifier;
+ }
dsa_slave_mii_bus_init(ds);
err = mdiobus_register(ds->slave_mii_bus);
if (err < 0)
- return err;
+ goto unregister_notifier;
}
return 0;
+
+unregister_notifier:
+ dsa_switch_unregister_notifier(ds);
+unregister_devlink:
+ devlink_unregister(ds->devlink);
+free_devlink:
+ devlink_free(ds->devlink);
+ ds->devlink = NULL;
+
+ return err;
}
static void dsa_switch_teardown(struct dsa_switch *ds)
@@ -397,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
dsa_switch_unregister_notifier(ds);
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+
if (ds->devlink) {
devlink_unregister(ds->devlink);
devlink_free(ds->devlink);
@@ -409,8 +423,8 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
{
struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port;
- int err;
+ int device, port, i;
+ int err = 0;
for (device = 0; device < DSA_MAX_SWITCHES; device++) {
ds = dst->ds[device];
@@ -419,18 +433,41 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
err = dsa_switch_setup(ds);
if (err)
- return err;
+ goto switch_teardown;
for (port = 0; port < ds->num_ports; port++) {
dp = &ds->ports[port];
err = dsa_port_setup(dp);
if (err)
- return err;
+ goto ports_teardown;
}
}
return 0;
+
+ports_teardown:
+ for (i = 0; i < port; i++)
+ dsa_port_teardown(&ds->ports[i]);
+
+ dsa_switch_teardown(ds);
+
+switch_teardown:
+ for (i = 0; i < device; i++) {
+ ds = dst->ds[i];
+ if (!ds)
+ continue;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = &ds->ports[port];
+
+ dsa_port_teardown(dp);
+ }
+
+ dsa_switch_teardown(ds);
+ }
+
+ return err;
}
static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
@@ -492,17 +529,24 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
err = dsa_tree_setup_switches(dst);
if (err)
- return err;
+ goto teardown_default_cpu;
err = dsa_tree_setup_master(dst);
if (err)
- return err;
+ goto teardown_switches;
dst->setup = true;
pr_info("DSA: tree %d setup\n", dst->index);
return 0;
+
+teardown_switches:
+ dsa_tree_teardown_switches(dst);
+teardown_default_cpu:
+ dsa_tree_teardown_default_cpu(dst);
+
+ return err;
}
static void dsa_tree_teardown(struct dsa_switch_tree *dst)
@@ -543,8 +587,10 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
dst->ds[index] = ds;
err = dsa_tree_setup(dst);
- if (err)
- dsa_tree_remove_switch(dst, index);
+ if (err) {
+ dst->ds[index] = NULL;
+ dsa_tree_put(dst);
+ }
return err;
}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3986cedfafc0..b2be53a13aa0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -159,6 +159,23 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags);
int dsa_port_vid_del(struct dsa_port *dp, u16 vid);
int dsa_port_link_register_of(struct dsa_port *dp);
void dsa_port_link_unregister_of(struct dsa_port *dp);
+void dsa_port_phylink_validate(struct phylink_config *config,
+ unsigned long *supported,
+ struct phylink_link_state *state);
+int dsa_port_phylink_mac_link_state(struct phylink_config *config,
+ struct phylink_link_state *state);
+void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state);
+void dsa_port_phylink_mac_an_restart(struct phylink_config *config);
+void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface);
+void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev);
+extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
/* slave.c */
extern const struct dsa_device_ops notag_netdev_ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 363eab6df51b..d2b65e8dc60c 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -336,9 +336,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
- /* Can be called from dsa_slave_port_obj_add() or
- * dsa_slave_vlan_rx_add_vid()
- */
if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
@@ -354,12 +351,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
.vlan = vlan,
};
- if (vlan->obj.orig_dev && netif_is_bridge_master(vlan->obj.orig_dev))
- return -EOPNOTSUPP;
-
- /* Can be called from dsa_slave_port_obj_del() or
- * dsa_slave_vlan_rx_kill_vid()
- */
if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
@@ -418,6 +409,108 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
return phydev;
}
+void dsa_port_phylink_validate(struct phylink_config *config,
+ unsigned long *supported,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_validate)
+ return;
+
+ ds->ops->phylink_validate(ds, dp->index, supported, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_validate);
+
+int dsa_port_phylink_mac_link_state(struct phylink_config *config,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ /* Only called for SGMII and 802.3z */
+ if (!ds->ops->phylink_mac_link_state)
+ return -EOPNOTSUPP;
+
+ return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state);
+
+void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_config)
+ return;
+
+ ds->ops->phylink_mac_config(ds, dp->index, mode, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config);
+
+void dsa_port_phylink_mac_an_restart(struct phylink_config *config)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_an_restart)
+ return;
+
+ ds->ops->phylink_mac_an_restart(ds, dp->index);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart);
+
+void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct phy_device *phydev = NULL;
+ struct dsa_switch *ds = dp->ds;
+
+ if (dsa_is_user_port(ds, dp->index))
+ phydev = dp->slave->phydev;
+
+ if (!ds->ops->phylink_mac_link_down) {
+ if (ds->ops->adjust_link && phydev)
+ ds->ops->adjust_link(ds, dp->index, phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down);
+
+void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_link_up) {
+ if (ds->ops->adjust_link && phydev)
+ ds->ops->adjust_link(ds, dp->index, phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up);
+
+const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
+ .validate = dsa_port_phylink_validate,
+ .mac_link_state = dsa_port_phylink_mac_link_state,
+ .mac_config = dsa_port_phylink_mac_config,
+ .mac_an_restart = dsa_port_phylink_mac_an_restart,
+ .mac_link_down = dsa_port_phylink_mac_link_down,
+ .mac_link_up = dsa_port_phylink_mac_link_up,
+};
+
static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
{
struct dsa_switch *ds = dp->ds;
@@ -495,8 +588,53 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
return 0;
}
+static int dsa_port_phylink_register(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct device_node *port_dn = dp->dn;
+ int mode, err;
+
+ mode = of_get_phy_mode(port_dn);
+ if (mode < 0)
+ mode = PHY_INTERFACE_MODE_NA;
+
+ dp->pl_config.dev = ds->dev;
+ dp->pl_config.type = PHYLINK_DEV;
+
+ dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn),
+ mode, &dsa_port_phylink_mac_ops);
+ if (IS_ERR(dp->pl)) {
+ pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
+ return PTR_ERR(dp->pl);
+ }
+
+ err = phylink_of_phy_connect(dp->pl, port_dn, 0);
+ if (err && err != -ENODEV) {
+ pr_err("could not attach to PHY: %d\n", err);
+ goto err_phy_connect;
+ }
+
+ rtnl_lock();
+ phylink_start(dp->pl);
+ rtnl_unlock();
+
+ return 0;
+
+err_phy_connect:
+ phylink_destroy(dp->pl);
+ return err;
+}
+
int dsa_port_link_register_of(struct dsa_port *dp)
{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->adjust_link)
+ return dsa_port_phylink_register(dp);
+
+ dev_warn(ds->dev,
+ "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n");
+
if (of_phy_is_fixed_link(dp->dn))
return dsa_port_fixed_link_register_of(dp);
else
@@ -505,6 +643,16 @@ int dsa_port_link_register_of(struct dsa_port *dp)
void dsa_port_link_unregister_of(struct dsa_port *dp)
{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->adjust_link) {
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+ phylink_destroy(dp->pl);
+ return;
+ }
+
if (of_phy_is_fixed_link(dp->dn))
of_phy_deregister_fixed_link(dp->dn);
else
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8157be7e162d..99673f6b07f6 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -22,7 +22,7 @@
#include "dsa_priv.h"
-static bool dsa_slave_dev_check(struct net_device *dev);
+static bool dsa_slave_dev_check(const struct net_device *dev);
/* slave mii_bus handling ***************************************************/
static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
@@ -311,7 +311,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
static int dsa_slave_port_obj_add(struct net_device *dev,
const struct switchdev_obj *obj,
- struct switchdev_trans *trans)
+ struct switchdev_trans *trans,
+ struct netlink_ext_ack *extack)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
int err;
@@ -323,6 +324,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
break;
case SWITCHDEV_OBJ_ID_HOST_MDB:
@@ -333,6 +336,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
trans);
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj),
trans);
break;
@@ -352,6 +357,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_HOST_MDB:
@@ -361,6 +368,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
break;
default:
@@ -423,6 +432,8 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
if (!clone)
return;
+ DSA_SKB_CB(skb)->clone = clone;
+
if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type))
return;
@@ -460,6 +471,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
u64_stats_update_end(&s->syncp);
DSA_SKB_CB(skb)->deferred_xmit = false;
+ DSA_SKB_CB(skb)->clone = NULL;
/* Identify PTP protocol packets, clone them, and pass them to the
* switch driver
@@ -1160,98 +1172,6 @@ static struct device_type dsa_type = {
.name = "dsa",
};
-static void dsa_slave_phylink_validate(struct net_device *dev,
- unsigned long *supported,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_validate)
- return;
-
- ds->ops->phylink_validate(ds, dp->index, supported, state);
-}
-
-static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- /* Only called for SGMII and 802.3z */
- if (!ds->ops->phylink_mac_link_state)
- return -EOPNOTSUPP;
-
- return ds->ops->phylink_mac_link_state(ds, dp->index, state);
-}
-
-static void dsa_slave_phylink_mac_config(struct net_device *dev,
- unsigned int mode,
- const struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_config)
- return;
-
- ds->ops->phylink_mac_config(ds, dp->index, mode, state);
-}
-
-static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_an_restart)
- return;
-
- ds->ops->phylink_mac_an_restart(ds, dp->index);
-}
-
-static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_down) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
-}
-
-static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface,
- struct phy_device *phydev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_up) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
-}
-
-static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
- .validate = dsa_slave_phylink_validate,
- .mac_link_state = dsa_slave_phylink_mac_link_state,
- .mac_config = dsa_slave_phylink_mac_config,
- .mac_an_restart = dsa_slave_phylink_mac_an_restart,
- .mac_link_down = dsa_slave_phylink_mac_link_down,
- .mac_link_up = dsa_slave_phylink_mac_link_up,
-};
-
void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
{
const struct dsa_port *dp = dsa_to_port(ds, port);
@@ -1299,8 +1219,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
- dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
- &dsa_slave_phylink_mac_ops);
+ dp->pl_config.dev = &slave_dev->dev;
+ dp->pl_config.type = PHYLINK_NETDEV;
+
+ dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode,
+ &dsa_port_phylink_mac_ops);
if (IS_ERR(dp->pl)) {
netdev_err(slave_dev,
"error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
@@ -1494,7 +1417,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
free_netdev(slave_dev);
}
-static bool dsa_slave_dev_check(struct net_device *dev)
+static bool dsa_slave_dev_check(const struct net_device *dev)
{
return dev->netdev_ops == &dsa_slave_netdev_ops;
}
@@ -1565,19 +1488,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
return NOTIFY_DONE;
}
-static int
-dsa_slave_switchdev_port_attr_set_event(struct net_device *netdev,
- struct switchdev_notifier_port_attr_info *port_attr_info)
-{
- int err;
-
- err = dsa_slave_port_attr_set(netdev, port_attr_info->attr,
- port_attr_info->trans);
-
- port_attr_info->handled = true;
- return notifier_from_errno(err);
-}
-
struct dsa_switchdev_event_work {
struct work_struct work;
struct switchdev_notifier_fdb_info fdb_info;
@@ -1652,13 +1562,18 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
struct dsa_switchdev_event_work *switchdev_work;
+ int err;
+
+ if (event == SWITCHDEV_PORT_ATTR_SET) {
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_attr_set);
+ return notifier_from_errno(err);
+ }
if (!dsa_slave_dev_check(dev))
return NOTIFY_DONE;
- if (event == SWITCHDEV_PORT_ATTR_SET)
- return dsa_slave_switchdev_port_attr_set_event(dev, ptr);
-
switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
if (!switchdev_work)
return NOTIFY_BAD;
@@ -1688,41 +1603,28 @@ err_fdb_work_init:
return NOTIFY_BAD;
}
-static int
-dsa_slave_switchdev_port_obj_event(unsigned long event,
- struct net_device *netdev,
- struct switchdev_notifier_port_obj_info *port_obj_info)
-{
- int err = -EOPNOTSUPP;
-
- switch (event) {
- case SWITCHDEV_PORT_OBJ_ADD:
- err = dsa_slave_port_obj_add(netdev, port_obj_info->obj,
- port_obj_info->trans);
- break;
- case SWITCHDEV_PORT_OBJ_DEL:
- err = dsa_slave_port_obj_del(netdev, port_obj_info->obj);
- break;
- }
-
- port_obj_info->handled = true;
- return notifier_from_errno(err);
-}
-
static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
+ int err;
switch (event) {
- case SWITCHDEV_PORT_OBJ_ADD: /* fall through */
+ case SWITCHDEV_PORT_OBJ_ADD:
+ err = switchdev_handle_port_obj_add(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_obj_add);
+ return notifier_from_errno(err);
case SWITCHDEV_PORT_OBJ_DEL:
- return dsa_slave_switchdev_port_obj_event(event, dev, ptr);
+ err = switchdev_handle_port_obj_del(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_obj_del);
+ return notifier_from_errno(err);
case SWITCHDEV_PORT_ATTR_SET:
- return dsa_slave_switchdev_port_attr_set_event(dev, ptr);
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_attr_set);
+ return notifier_from_errno(err);
}
return NOTIFY_DONE;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 65a35e976d7b..6ebbd799c4eb 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
}
EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
- struct packet_type *pt, u16 *tpid, u16 *tci)
+/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
+ * tag, after tpid and before tci. This is because so far, ETH_HLEN
+ * (DMAC, SMAC, EtherType) bytes were pulled.
+ * There are 2 bytes of VLAN tag left in skb->data, and upper
+ * layers expect the 'real' EtherType to be consumed as well.
+ * Coincidentally, a VLAN header is also of the same size as
+ * the number of bytes that need to be pulled.
+ *
+ * skb_mac_header skb->data
+ * | |
+ * v v
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * | Destination MAC | Source MAC | TPID | TCI | EType |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * ^ | |
+ * |<--VLAN_HLEN-->to <---VLAN_HLEN--->
+ * from |
+ * >>>>>>> v
+ * >>>>>>> | | | | | | | | | | | | | | |
+ * >>>>>>> +-----------------------+-----------------------+-------+
+ * >>>>>>> | Destination MAC | Source MAC | EType |
+ * +-----------------------+-----------------------+-------+
+ * ^ ^
+ * (now part of | |
+ * skb->head) skb_mac_header skb->data
+ */
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
{
- struct vlan_ethhdr *tag;
-
- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
- return NULL;
+ u8 *from = skb_mac_header(skb);
+ u8 *dest = from + VLAN_HLEN;
- tag = vlan_eth_hdr(skb);
- *tpid = ntohs(tag->h_vlan_proto);
- *tci = ntohs(tag->h_vlan_TCI);
-
- /* skb->data points in the middle of the VLAN tag,
- * after tpid and before tci. This is because so far,
- * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled.
- * There are 2 bytes of VLAN tag left in skb->data, and upper
- * layers expect the 'real' EtherType to be consumed as well.
- * Coincidentally, a VLAN header is also of the same size as
- * the number of bytes that need to be pulled.
- */
- skb_pull_rcsum(skb, VLAN_HLEN);
+ memmove(dest, from, ETH_HLEN - VLAN_HLEN);
+ skb_pull(skb, VLAN_HLEN);
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ skb_pull_rcsum(skb, ETH_HLEN);
return skb;
}
-EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
+EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
static const struct dsa_device_ops dsa_8021q_netdev_ops = {
.name = "8021q",
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index d43737e6c3fb..1d96c9d4a8e9 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -13,6 +13,8 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
const struct ethhdr *hdr = eth_hdr(skb);
u64 dmac = ether_addr_to_u64(hdr->h_dest);
+ if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META)
+ return false;
if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) ==
SJA1105_LINKLOCAL_FILTER_A)
return true;
@@ -22,15 +24,61 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
return false;
}
+struct sja1105_meta {
+ u64 tstamp;
+ u64 dmac_byte_4;
+ u64 dmac_byte_3;
+ u64 source_port;
+ u64 switch_id;
+};
+
+static void sja1105_meta_unpack(const struct sk_buff *skb,
+ struct sja1105_meta *meta)
+{
+ u8 *buf = skb_mac_header(skb) + ETH_HLEN;
+
+ /* UM10944.pdf section 4.2.17 AVB Parameters:
+ * Structure of the meta-data follow-up frame.
+ * It is in network byte order, so there are no quirks
+ * while unpacking the meta frame.
+ *
+ * Also SJA1105 E/T only populates bits 23:0 of the timestamp
+ * whereas P/Q/R/S does 32 bits. Since the structure is the
+ * same and the E/T puts zeroes in the high-order byte, use
+ * a unified unpacking command for both device series.
+ */
+ packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0);
+ packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0);
+ packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0);
+ packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0);
+ packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0);
+}
+
+static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 smac = ether_addr_to_u64(hdr->h_source);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (smac != SJA1105_META_SMAC)
+ return false;
+ if (dmac != SJA1105_META_DMAC)
+ return false;
+ if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+ return false;
+ return true;
+}
+
/* This is the first time the tagger sees the frame on RX.
- * Figure out if we can decode it, and if we can, annotate skb->cb with how we
- * plan to do that, so we don't need to check again in the rcv function.
+ * Figure out if we can decode it.
*/
static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev)
{
+ if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
+ return true;
if (sja1105_is_link_local(skb))
return true;
- if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
+ if (sja1105_is_meta_frame(skb))
return true;
return false;
}
@@ -62,25 +110,152 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
((pcp << VLAN_PRIO_SHIFT) | tx_vid));
}
+static void sja1105_transfer_meta(struct sk_buff *skb,
+ const struct sja1105_meta *meta)
+{
+ struct ethhdr *hdr = eth_hdr(skb);
+
+ hdr->h_dest[3] = meta->dmac_byte_3;
+ hdr->h_dest[4] = meta->dmac_byte_4;
+ SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+ struct sja1105_meta *meta,
+ bool is_link_local,
+ bool is_meta)
+{
+ struct sja1105_port *sp;
+ struct dsa_port *dp;
+
+ dp = dsa_slave_to_port(skb->dev);
+ sp = dp->priv;
+
+ /* Step 1: A timestampable frame was received.
+ * Buffer it until we get its meta frame.
+ */
+ if (is_link_local && sp->data->hwts_rx_en) {
+ spin_lock(&sp->data->meta_lock);
+ /* Was this a link-local frame instead of the meta
+ * that we were expecting?
+ */
+ if (sp->data->stampable_skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Expected meta frame, is %12llx "
+ "in the DSA master multicast filter?\n",
+ SJA1105_META_DMAC);
+ }
+
+ /* Hold a reference to avoid dsa_switch_rcv
+ * from freeing the skb.
+ */
+ sp->data->stampable_skb = skb_get(skb);
+ spin_unlock(&sp->data->meta_lock);
+
+ /* Tell DSA we got nothing */
+ return NULL;
+
+ /* Step 2: The meta frame arrived.
+ * Time to take the stampable skb out of the closet, annotate it
+ * with the partial timestamp, and pretend that we received it
+ * just now (basically masquerade the buffered frame as the meta
+ * frame, which serves no further purpose).
+ */
+ } else if (is_meta) {
+ struct sk_buff *stampable_skb;
+
+ spin_lock(&sp->data->meta_lock);
+
+ stampable_skb = sp->data->stampable_skb;
+ sp->data->stampable_skb = NULL;
+
+ /* Was this a meta frame instead of the link-local
+ * that we were expecting?
+ */
+ if (!stampable_skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Unexpected meta frame\n");
+ spin_unlock(&sp->data->meta_lock);
+ return NULL;
+ }
+
+ if (stampable_skb->dev != skb->dev) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Meta frame on wrong port\n");
+ spin_unlock(&sp->data->meta_lock);
+ return NULL;
+ }
+
+ /* Free the meta frame and give DSA the buffered stampable_skb
+ * for further processing up the network stack.
+ */
+ kfree_skb(skb);
+
+ skb = skb_copy(stampable_skb, GFP_ATOMIC);
+ if (!skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Failed to copy stampable skb\n");
+ return NULL;
+ }
+ sja1105_transfer_meta(skb, meta);
+ /* The cached copy will be freed now */
+ skb_unref(stampable_skb);
+
+ spin_unlock(&sp->data->meta_lock);
+ }
+
+ return skb;
+}
+
static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
struct net_device *netdev,
struct packet_type *pt)
{
- struct ethhdr *hdr = eth_hdr(skb);
- u64 source_port, switch_id;
- struct sk_buff *nskb;
+ struct sja1105_meta meta = {0};
+ int source_port, switch_id;
+ struct vlan_ethhdr *hdr;
u16 tpid, vid, tci;
+ bool is_link_local;
bool is_tagged;
+ bool is_meta;
- nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci);
- is_tagged = (nskb && tpid == ETH_P_SJA1105);
-
- skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
- vid = tci & VLAN_VID_MASK;
+ hdr = vlan_eth_hdr(skb);
+ tpid = ntohs(hdr->h_vlan_proto);
+ is_tagged = (tpid == ETH_P_SJA1105);
+ is_link_local = sja1105_is_link_local(skb);
+ is_meta = sja1105_is_meta_frame(skb);
skb->offload_fwd_mark = 1;
- if (sja1105_is_link_local(skb)) {
+ if (is_tagged) {
+ /* Normal traffic path. */
+ tci = ntohs(hdr->h_vlan_TCI);
+ vid = tci & VLAN_VID_MASK;
+ source_port = dsa_8021q_rx_source_port(vid);
+ switch_id = dsa_8021q_rx_switch_id(vid);
+ skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ } else if (is_link_local) {
/* Management traffic path. Switch embeds the switch ID and
* port ID into bytes of the destination MAC, courtesy of
* the incl_srcpt options.
@@ -90,10 +265,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
/* Clear the DMAC bytes that were mangled by the switch */
hdr->h_dest[3] = 0;
hdr->h_dest[4] = 0;
+ } else if (is_meta) {
+ sja1105_meta_unpack(skb, &meta);
+ source_port = meta.source_port;
+ switch_id = meta.switch_id;
} else {
- /* Normal traffic path. */
- source_port = dsa_8021q_rx_source_port(vid);
- switch_id = dsa_8021q_rx_switch_id(vid);
+ return NULL;
}
skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
@@ -106,10 +283,10 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
* it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
*/
if (is_tagged)
- memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN,
- ETH_HLEN - VLAN_HLEN);
+ skb = dsa_8021q_remove_header(skb);
- return skb;
+ return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+ is_meta);
}
static struct dsa_device_ops sja1105_netdev_ops = {
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 20b0bcb7e9e3..17374afee28f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -545,17 +545,10 @@ unsigned char * __weak arch_get_platform_mac_address(void)
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
{
- const unsigned char *addr;
- struct device_node *dp;
+ const unsigned char *addr = NULL;
- if (dev_is_pci(dev))
- dp = pci_device_to_OF_node(to_pci_dev(dev));
- else
- dp = dev->of_node;
-
- addr = NULL;
- if (dp)
- addr = of_get_mac_address(dp);
+ if (dev->of_node)
+ addr = of_get_mac_address(dev->of_node);
if (IS_ERR_OR_NULL(addr))
addr = arch_get_platform_mac_address();
@@ -563,6 +556,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
return -ENODEV;
ether_addr_copy(mac_addr, addr);
+
return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 9133e3cede77..e4aba5d485be 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -74,7 +74,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
key.src = *src;
key.dst = *dst;
- q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+ q = inet_frag_find(ieee802154_lowpan->fqdir, &key);
if (!q)
return NULL;
@@ -134,7 +134,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
fq->q.flags |= INET_FRAG_FIRST_IN;
fq->q.meat += skb->len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
@@ -321,23 +321,18 @@ err:
static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname = "6lowpanfrag_high_thresh",
- .data = &init_net.ieee802154_lowpan.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
},
{
.procname = "6lowpanfrag_low_thresh",
- .data = &init_net.ieee802154_lowpan.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
},
{
.procname = "6lowpanfrag_time",
- .data = &init_net.ieee802154_lowpan.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -372,17 +367,17 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
if (table == NULL)
goto err_alloc;
- table[0].data = &ieee802154_lowpan->frags.high_thresh;
- table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
- table[1].data = &ieee802154_lowpan->frags.low_thresh;
- table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
- table[2].data = &ieee802154_lowpan->frags.timeout;
-
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
table[0].procname = NULL;
}
+ table[0].data = &ieee802154_lowpan->fqdir->high_thresh;
+ table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].data = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh;
+ table[2].data = &ieee802154_lowpan->fqdir->timeout;
+
hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table);
if (hdr == NULL)
goto err_reg;
@@ -449,32 +444,42 @@ static int __net_init lowpan_frags_init_net(struct net *net)
net_ieee802154_lowpan(net);
int res;
- ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
- ieee802154_lowpan->frags.f = &lowpan_frags;
- res = inet_frags_init_net(&ieee802154_lowpan->frags);
+ res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
if (res < 0)
return res;
+
+ ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
return res;
}
+static void __net_exit lowpan_frags_pre_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ fqdir_pre_exit(ieee802154_lowpan->fqdir);
+}
+
static void __net_exit lowpan_frags_exit_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
lowpan_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
}
static struct pernet_operations lowpan_frags_ops = {
- .init = lowpan_frags_init_net,
- .exit = lowpan_frags_exit_net,
+ .init = lowpan_frags_init_net,
+ .pre_exit = lowpan_frags_pre_exit_net,
+ .exit = lowpan_frags_exit_net,
};
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
@@ -539,7 +544,7 @@ err_sysctl:
void lowpan_net_frag_exit(void)
{
- inet_frags_fini(&lowpan_frags);
lowpan_frags_sysctl_unregister();
unregister_pernet_subsys(&lowpan_frags_ops);
+ inet_frags_fini(&lowpan_frags);
}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 000a61994c8f..d57ecfaf89d4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
- metrics.o netlink.o
+ metrics.o netlink.o nexthop.o
obj-$(CONFIG_BPFILTER) += bpfilter/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c6bd0f7a020a..137d1892395d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -190,7 +190,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
static int devinet_sysctl_register(struct in_device *idev);
@@ -296,8 +297,8 @@ static void in_dev_rcu_put(struct rcu_head *head)
static void inetdev_destroy(struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
struct net_device *dev;
+ struct in_ifaddr *ifa;
ASSERT_RTNL();
@@ -307,7 +308,7 @@ static void inetdev_destroy(struct in_device *in_dev)
ip_mc_destroy_dev(in_dev);
- while ((ifa = in_dev->ifa_list) != NULL) {
+ while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
inet_free_ifa(ifa);
}
@@ -323,30 +324,35 @@ static void inetdev_destroy(struct in_device *in_dev)
int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
{
+ const struct in_ifaddr *ifa;
+
rcu_read_lock();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
- } endfor_ifa(in_dev);
+ }
rcu_read_unlock();
return 0;
}
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 portid)
+static void __inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
- struct in_ifaddr *ifa, *ifa1 = *ifap;
- struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *ifa, *ifa1;
+ struct in_ifaddr *last_prim;
struct in_ifaddr *prev_prom = NULL;
int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
+ ifa1 = rtnl_dereference(*ifap);
+ last_prim = rtnl_dereference(in_dev->ifa_list);
if (in_dev->dead)
goto no_promotions;
@@ -355,9 +361,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
**/
if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
- struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+ struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
- while ((ifa = *ifap1) != NULL) {
+ while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
ifa1->ifa_scope <= ifa->ifa_scope)
last_prim = ifa;
@@ -390,7 +396,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
* and later to add them back with new prefsrc. Do this
* while all addresses are on the device list.
*/
- for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa))
fib_del_ifaddr(ifa, ifa1);
@@ -416,19 +422,25 @@ no_promotions:
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
- struct in_ifaddr *next_sec = promote->ifa_next;
+ struct in_ifaddr *next_sec;
+ next_sec = rtnl_dereference(promote->ifa_next);
if (prev_prom) {
- prev_prom->ifa_next = promote->ifa_next;
- promote->ifa_next = last_prim->ifa_next;
- last_prim->ifa_next = promote;
+ struct in_ifaddr *last_sec;
+
+ rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+
+ last_sec = rtnl_dereference(last_prim->ifa_next);
+ rcu_assign_pointer(promote->ifa_next, last_sec);
+ rcu_assign_pointer(last_prim->ifa_next, promote);
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa;
+ ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -440,7 +452,8 @@ no_promotions:
inet_free_ifa(ifa1);
}
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy)
{
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
@@ -453,9 +466,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid, struct netlink_ext_ack *extack)
{
+ struct in_ifaddr __rcu **last_primary, **ifap;
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap, **last_primary;
struct in_validator_info ivi;
+ struct in_ifaddr *ifa1;
int ret;
ASSERT_RTNL();
@@ -468,8 +482,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifa->ifa_flags &= ~IFA_F_SECONDARY;
last_primary = &in_dev->ifa_list;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ ifap = &in_dev->ifa_list;
+ ifa1 = rtnl_dereference(*ifap);
+
+ while (ifa1) {
if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
ifa->ifa_scope <= ifa1->ifa_scope)
last_primary = &ifa1->ifa_next;
@@ -485,6 +501,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
ifa->ifa_flags |= IFA_F_SECONDARY;
}
+
+ ifap = &ifa1->ifa_next;
+ ifa1 = rtnl_dereference(*ifap);
}
/* Allow any devices that wish to register ifaddr validtors to weigh
@@ -510,8 +529,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifap = last_primary;
}
- ifa->ifa_next = *ifap;
- *ifap = ifa;
+ rcu_assign_pointer(ifa->ifa_next, *ifap);
+ rcu_assign_pointer(*ifap, ifa);
inet_hash_insert(dev_net(in_dev->dev), ifa);
@@ -576,12 +595,14 @@ EXPORT_SYMBOL(inetdev_by_index);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
{
+ struct in_ifaddr *ifa;
+
ASSERT_RTNL();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
- } endfor_ifa(in_dev);
+ }
return NULL;
}
@@ -609,10 +630,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct in_ifaddr __rcu **ifap;
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
- struct in_ifaddr *ifa, **ifap;
+ struct in_ifaddr *ifa;
+
int err = -EINVAL;
ASSERT_RTNL();
@@ -629,7 +652,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
@@ -717,15 +740,19 @@ static void check_lifetime(struct work_struct *work)
if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
age >= ifa->ifa_valid_lft) {
- struct in_ifaddr **ifap;
+ struct in_ifaddr __rcu **ifap;
+ struct in_ifaddr *tmp;
- for (ifap = &ifa->ifa_dev->ifa_list;
- *ifap != NULL; ifap = &(*ifap)->ifa_next) {
- if (*ifap == ifa) {
+ ifap = &ifa->ifa_dev->ifa_list;
+ tmp = rtnl_dereference(*ifap);
+ while (tmp) {
+ if (tmp == ifa) {
inet_del_ifa(ifa->ifa_dev,
ifap, 1);
break;
}
+ ifap = &tmp->ifa_next;
+ tmp = rtnl_dereference(*ifap);
}
} else if (ifa->ifa_preferred_lft !=
INFINITY_LIFE_TIME &&
@@ -869,13 +896,12 @@ errout:
static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap;
+ struct in_ifaddr *ifa1;
if (!ifa->ifa_local)
return NULL;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
@@ -970,8 +996,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
{
struct sockaddr_in sin_orig;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+ struct in_ifaddr __rcu **ifap = NULL;
struct in_device *in_dev;
- struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
struct net_device *dev;
char *colon;
@@ -1042,7 +1068,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
/* note: we only do this for a limited set of ioctls
and only if the original address family was AF_INET.
This is checked above. */
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
@@ -1055,7 +1083,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
4.3BSD-style and passed in junk so we fall back to
comparing just the label */
if (!ifa) {
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next)
if (!strcmp(ifr->ifr_name, ifa->ifa_label))
break;
@@ -1204,7 +1233,7 @@ out:
static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
struct in_device *in_dev = __in_dev_get_rtnl(dev);
- struct in_ifaddr *ifa;
+ const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
@@ -1214,7 +1243,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s
if (!in_dev)
goto out;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1242,18 +1271,24 @@ out:
static __be32 in_dev_select_addr(const struct in_device *in_dev,
int scope)
{
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
return 0;
}
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
struct net *net = dev_net(dev);
int master_idx;
@@ -1263,8 +1298,13 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (!in_dev)
goto no_in_dev;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope > scope)
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+ if (min(ifa->ifa_scope, localnet_scope) > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
addr = ifa->ifa_local;
@@ -1272,7 +1312,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
if (addr)
goto out_unlock;
@@ -1317,13 +1357,20 @@ EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
{
- int same = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ int same = 0;
+
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);
- for_ifa(in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
- ifa->ifa_scope <= scope) {
+ min_scope <= scope) {
addr = ifa->ifa_local;
if (same)
break;
@@ -1338,7 +1385,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
if (inet_ifa_match(addr, ifa))
break;
/* No, then can we use new local src? */
- if (ifa->ifa_scope <= scope) {
+ if (min_scope <= scope) {
addr = ifa->ifa_local;
break;
}
@@ -1346,7 +1393,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
same = 0;
}
}
- } endfor_ifa(in_dev);
+ }
return same ? addr : 0;
}
@@ -1420,7 +1467,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1450,10 +1497,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
+ const struct in_ifaddr *ifa;
- for (ifa = in_dev->ifa_list; ifa;
- ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_send(ARPOP_REQUEST, ETH_P_ARP,
ifa->ifa_local, dev,
ifa->ifa_local, NULL,
@@ -1723,15 +1769,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
int ip_idx = 0;
int err;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+ if (ip_idx < s_ip_idx) {
+ ip_idx++;
continue;
-
+ }
err = inet_fill_ifaddr(skb, ifa, fillargs);
if (err < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
}
err = 0;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e54c2bcbb465..317339cd7f03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -39,6 +39,7 @@
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
@@ -188,7 +189,7 @@ int fib_unmerge(struct net *net)
return 0;
}
-static void fib_flush(struct net *net)
+void fib_flush(struct net *net)
{
int flushed = 0;
unsigned int h;
@@ -230,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
if (table) {
ret = RTN_UNICAST;
if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
- if (!dev || dev == res.fi->fib_dev)
+ struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
+
+ if (!dev || dev == nhc->nhc_dev)
ret = res.type;
}
}
@@ -317,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int ret;
- for (ret = 0; ret < fi->fib_nhs; ret++) {
- struct fib_nh *nh = &fi->fib_nh[ret];
+ for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
- if (nh->fib_nh_dev == dev) {
+ if (nhc->nhc_dev == dev) {
dev_match = true;
break;
- } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) {
+ } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) {
dev_match = true;
break;
}
}
#else
- if (fi->fib_nh[0].fib_nh_dev == dev)
+ if (fib_info_nhc(fi, 0)->nhc_dev == dev)
dev_match = true;
#endif
@@ -536,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
- struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+ }
+ rcu_read_unlock();
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -641,6 +652,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
}
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -659,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
@@ -796,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
if (err < 0)
goto errout;
break;
+ case RTA_NH_ID:
+ cfg->fc_nh_id = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ if (cfg->fc_nh_id) {
+ if (cfg->fc_oif || cfg->fc_gw_family ||
+ cfg->fc_encap || cfg->fc_mp) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ return -EINVAL;
}
}
@@ -822,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ err = -EINVAL;
+ goto errout;
+ }
+
tb = fib_get_table(net, cfg.fc_table);
if (!tb) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
@@ -881,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
return -EINVAL;
}
+
if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
return -EINVAL;
}
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ filter->dump_routes = false;
+ else
+ filter->dump_exceptions = false;
filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
filter->flags = rtm->rtm_flags;
@@ -931,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = { .dump_routes = true,
+ .dump_exceptions = true };
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct fib_dump_filter filter = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
@@ -950,8 +987,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
}
- /* fib entries are never clones and ipv4 does not use prefix flag */
- if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
+ /* ipv4 does not use prefix flag */
+ if (filter.flags & RTM_F_PREFIX)
return skb->len;
if (filter.table_id) {
@@ -1172,8 +1209,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
*
* Scan address list to be sure that addresses are really gone.
*/
-
- for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1241,6 +1278,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
}
}
+ rcu_read_unlock();
no_promotions:
if (!(ok & BRD_OK))
@@ -1410,6 +1448,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ struct in_ifaddr *ifa;
unsigned int flags;
if (event == NETDEV_UNREGISTER) {
@@ -1424,9 +1463,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
switch (event) {
case NETDEV_UP:
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
- } endfor_ifa(in_dev);
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
#endif
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 7945f0534db7..a68b5e21ec51 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a38e86b98e4f..b43a7ba5c6a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -27,6 +27,7 @@
#include <net/route.h>
#include <net/tcp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/fib_rules.h>
struct fib4_rule {
@@ -141,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
struct fib_result *result = (struct fib_result *) arg->result;
struct net_device *dev = NULL;
- if (result->fi)
- dev = result->fi->fib_dev;
+ if (result->fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0);
+
+ dev = nhc->nhc_dev;
+ }
/* do not accept result if the route does
* not meet the required prefix length
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index bfa49a88d03a..2db089e10ba0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -38,6 +38,7 @@
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
+#include <net/nexthop.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
@@ -56,18 +57,21 @@ static unsigned int fib_info_cnt;
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+/* for_nexthops and change_nexthops only used when nexthop object
+ * is not set in a fib_info. The logic within can reference fib_nh.
+ */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
for (nhsel = 0, nh = (fi)->fib_nh; \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nh++, nhsel++)
#define change_nexthops(fi) { \
int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -228,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
- change_nexthops(fi) {
- fib_nh_release(fi->fib_net, nexthop_nh);
- } endfor_nexthops(fi);
+ if (fi->nh) {
+ nexthop_put(fi->nh);
+ } else {
+ change_nexthops(fi) {
+ fib_nh_release(fi->fib_net, nexthop_nh);
+ } endfor_nexthops(fi);
+ }
ip_fib_metrics_put(fi->fib_metrics);
@@ -256,22 +264,34 @@ void fib_release_info(struct fib_info *fi)
hlist_del(&fi->fib_hash);
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
- change_nexthops(fi) {
- if (!nexthop_nh->fib_nh_dev)
- continue;
- hlist_del(&nexthop_nh->nh_hash);
- } endfor_nexthops(fi)
+ if (fi->nh) {
+ list_del(&fi->nh_list);
+ } else {
+ change_nexthops(fi) {
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hlist_del(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ }
fi->fib_dead = 1;
fib_info_put(fi);
}
spin_unlock_bh(&fib_info_lock);
}
-static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
- const struct fib_nh *onh = ofi->fib_nh;
+ const struct fib_nh *onh;
+
+ if (fi->nh || ofi->nh)
+ return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+
+ if (ofi->fib_nhs == 0)
+ return 0;
for_nexthops(fi) {
+ onh = fib_info_nh(ofi, nhsel);
+
if (nh->fib_nh_oif != onh->fib_nh_oif ||
nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
nh->fib_nh_scope != onh->fib_nh_scope ||
@@ -292,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
if (nh->fib_nh_gw_family == AF_INET6 &&
ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
return -1;
-
- onh++;
} endfor_nexthops(fi);
return 0;
}
@@ -307,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
-static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
+ u32 prefsrc, u32 priority)
{
- unsigned int mask = (fib_info_hash_size - 1);
- unsigned int val = fi->fib_nhs;
+ unsigned int val = init_val;
- val ^= (fi->fib_protocol << 8) | fi->fib_scope;
- val ^= (__force u32)fi->fib_prefsrc;
- val ^= fi->fib_priority;
- for_nexthops(fi) {
- val ^= fib_devindex_hashfn(nh->fib_nh_oif);
- } endfor_nexthops(fi)
+ val ^= (protocol << 8) | scope;
+ val ^= prefsrc;
+ val ^= priority;
+
+ return val;
+}
+
+static unsigned int fib_info_hashfn_result(unsigned int val)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
-static struct fib_info *fib_find_info(const struct fib_info *nfi)
+static inline unsigned int fib_info_hashfn(struct fib_info *fi)
+{
+ unsigned int val;
+
+ val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
+ fi->fib_scope, (__force u32)fi->fib_prefsrc,
+ fi->fib_priority);
+
+ if (fi->nh) {
+ val ^= fib_devindex_hashfn(fi->nh->id);
+ } else {
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->fib_nh_oif);
+ } endfor_nexthops(fi)
+ }
+
+ return fib_info_hashfn_result(val);
+}
+
+/* no metrics, only nexthop id */
+static struct fib_info *fib_find_info_nh(struct net *net,
+ const struct fib_config *cfg)
+{
+ struct hlist_head *head;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id),
+ cfg->fc_protocol, cfg->fc_scope,
+ (__force u32)cfg->fc_prefsrc,
+ cfg->fc_priority);
+ hash = fib_info_hashfn_result(hash);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!net_eq(fi->fib_net, net))
+ continue;
+ if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
+ continue;
+ if (cfg->fc_protocol == fi->fib_protocol &&
+ cfg->fc_scope == fi->fib_scope &&
+ cfg->fc_prefsrc == fi->fib_prefsrc &&
+ cfg->fc_priority == fi->fib_priority &&
+ cfg->fc_type == fi->fib_type &&
+ cfg->fc_table == fi->fib_tb_id &&
+ !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static struct fib_info *fib_find_info(struct fib_info *nfi)
{
struct hlist_head *head;
struct fib_info *fi;
@@ -344,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
!((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
- (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ nh_comp(fi, nfi) == 0)
return fi;
}
@@ -386,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_PRIORITY */
+ nla_total_size(4) /* RTA_PREFSRC */
+ nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+ unsigned int nhs = fib_info_num_path(fi);
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
- if (fi->fib_nhs) {
+ if (fi->nh)
+ payload += nla_total_size(4); /* RTA_NH_ID */
+
+ if (nhs) {
size_t nh_encapsize = 0;
- /* Also handles the special case fib_nhs == 1 */
+ /* Also handles the special case nhs == 1 */
/* each nexthop is packed in an attribute */
size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+ unsigned int i;
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
/* grab encap info */
- for_nexthops(fi) {
- if (nh->fib_nh_lws) {
+ for (i = 0; i < fib_info_num_path(fi); i++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+ if (nhc->nhc_lwtstate) {
/* RTA_ENCAP_TYPE */
nh_encapsize += lwtunnel_get_encap_size(
- nh->fib_nh_lws);
+ nhc->nhc_lwtstate);
/* RTA_ENCAP */
nh_encapsize += nla_total_size(2);
}
- } endfor_nexthops(fi);
+ }
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size((fi->fib_nhs * nhsize) +
- nh_encapsize);
+ payload += nla_total_size((nhs * nhsize) + nh_encapsize);
}
@@ -574,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
return nhs;
}
+/* only called when fib_nh is integrated into fib_info */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
struct net *net = fi->fib_net;
struct fib_config fib_cfg;
+ struct fib_nh *nh;
int ret;
change_nexthops(fi) {
@@ -642,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
} endfor_nexthops(fi);
ret = -EINVAL;
- if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
+ nh = fib_info_nh(fi, 0);
+ if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
NL_SET_ERR_MSG(extack,
"Nexthop device index does not match RTA_OIF");
goto errout;
}
if (cfg->fc_gw_family) {
- if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
+ if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
(cfg->fc_gw_family == AF_INET &&
- fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) ||
+ nh->fib_nh_gw4 != cfg->fc_gw4) ||
(cfg->fc_gw_family == AF_INET6 &&
- ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) {
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
NL_SET_ERR_MSG(extack,
"Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
goto errout;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
+ if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
NL_SET_ERR_MSG(extack,
"Nexthop class id does not match RTA_FLOW");
goto errout;
@@ -670,12 +753,13 @@ errout:
return ret;
}
+/* only called when fib_nh is integrated into fib_info */
static void fib_rebalance(struct fib_info *fi)
{
int total;
int w;
- if (fi->fib_nhs < 2)
+ if (fib_info_num_path(fi) < 2)
return;
total = 0;
@@ -756,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
return 1;
+ if (cfg->fc_nh_id) {
+ if (fi->nh && cfg->fc_nh_id == fi->nh->id)
+ return 0;
+ return 1;
+ }
+
if (cfg->fc_oif || cfg->fc_gw_family) {
+ struct fib_nh *nh = fib_info_nh(fi, 0);
+
if (cfg->fc_encap) {
if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- fi->fib_nh, cfg, extack))
+ nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow &&
- cfg->fc_flow != fi->fib_nh->nh_tclassid)
+ cfg->fc_flow != nh->nh_tclassid)
return 1;
#endif
- if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
+ if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
(cfg->fc_gw_family &&
- cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
+ cfg->fc_gw_family != nh->fib_nh_gw_family))
return 1;
if (cfg->fc_gw_family == AF_INET &&
- cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
+ cfg->fc_gw4 != nh->fib_nh_gw4)
return 1;
if (cfg->fc_gw_family == AF_INET6 &&
- ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6))
+ ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
return 1;
return 0;
@@ -1088,15 +1180,13 @@ out:
return err;
}
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
- struct netlink_ext_ack *extack)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+ struct netlink_ext_ack *extack)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
- u32 table = cfg->fc_table;
int err;
if (nh->fib_nh_gw_family == AF_INET)
- err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+ err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
else if (nh->fib_nh_gw_family == AF_INET6)
err = fib_check_nh_v6_gw(net, nh, table, extack);
else
@@ -1187,11 +1277,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
fib_info_hash_free(old_laddrhash, bytes);
}
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+ unsigned char scope)
{
- nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
- nh->fib_nh_gw4,
- nh->nh_parent->fib_scope);
+ struct fib_nh *nh;
+
+ if (nhc->nhc_family != AF_INET)
+ return inet_select_addr(nhc->nhc_dev, 0, scope);
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
return nh->nh_saddr;
@@ -1200,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
struct fib_nh_common *nhc = res->nhc;
- struct fib_nh *nh;
if (res->fi->fib_prefsrc)
return res->fi->fib_prefsrc;
- nh = container_of(nhc, struct fib_nh, nh_common);
- if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
- return nh->nh_saddr;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
+ return nh->nh_saddr;
+ }
- return fib_info_update_nh_saddr(net, nh);
+ return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
@@ -1241,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
{
int err;
struct fib_info *fi = NULL;
+ struct nexthop *nh = NULL;
struct fib_info *ofi;
int nhs = 1;
struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1260,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
+ if (cfg->fc_nh_id) {
+ if (!cfg->fc_mx) {
+ fi = fib_find_info_nh(net, cfg);
+ if (fi) {
+ fi->fib_treeref++;
+ return fi;
+ }
+ }
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto err_inval;
+ }
+ nhs = 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
@@ -1295,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
cfg->fc_mx_len, extack);
- if (unlikely(IS_ERR(fi->fib_metrics))) {
+ if (IS_ERR(fi->fib_metrics)) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
return ERR_PTR(err);
@@ -1312,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
- change_nexthops(fi) {
- nexthop_nh->nh_parent = fi;
- } endfor_nexthops(fi)
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = 0;
+ fi->nh = nh;
+ }
+ } else {
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ } endfor_nexthops(fi)
- if (cfg->fc_mp)
- err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
- else
- err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+ if (cfg->fc_mp)
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+ extack);
+ else
+ err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+ }
if (err != 0)
goto failure;
@@ -1350,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- if (cfg->fc_scope == RT_SCOPE_HOST) {
+ if (fi->nh) {
+ err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+ if (err)
+ goto failure;
+ } else if (cfg->fc_scope == RT_SCOPE_HOST) {
struct fib_nh *nh = fi->fib_nh;
/* Local address is added. */
@@ -1365,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
nh->fib_nh_scope = RT_SCOPE_NOWHERE;
- nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
+ nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
err = -ENODEV;
if (!nh->fib_nh_dev)
goto failure;
@@ -1373,7 +1504,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int linkdown = 0;
change_nexthops(fi) {
- err = fib_check_nh(cfg, nexthop_nh, extack);
+ err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+ cfg->fc_table, cfg->fc_scope,
+ extack);
if (err != 0)
goto failure;
if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
@@ -1388,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- change_nexthops(fi) {
- fib_info_update_nh_saddr(net, nexthop_nh);
- if (nexthop_nh->fib_nh_gw_family == AF_INET6)
- fi->fib_nh_is_v6 = true;
- } endfor_nexthops(fi)
+ if (!fi->nh) {
+ change_nexthops(fi) {
+ fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+ fi->fib_scope);
+ if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+ fi->fib_nh_is_v6 = true;
+ } endfor_nexthops(fi)
- fib_rebalance(fi);
+ fib_rebalance(fi);
+ }
link_it:
ofi = fib_find_info(fi);
@@ -1416,16 +1552,20 @@ link_it:
head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
hlist_add_head(&fi->fib_lhash, head);
}
- change_nexthops(fi) {
- struct hlist_head *head;
- unsigned int hash;
+ if (fi->nh) {
+ list_add(&fi->nh_list, &nh->fi_list);
+ } else {
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
- if (!nexthop_nh->fib_nh_dev)
- continue;
- hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_add_head(&nexthop_nh->nh_hash, head);
- } endfor_nexthops(fi)
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ }
spin_unlock_bh(&fib_info_lock);
return fi;
@@ -1552,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
if (!mp)
goto nla_put_failure;
+ if (unlikely(fi->nh)) {
+ if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+ goto nla_put_failure;
+ goto mp_end;
+ }
+
for_nexthops(fi) {
if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
goto nla_put_failure;
@@ -1562,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
#endif
} endfor_nexthops(fi);
+mp_end:
nla_nest_end(skb, mp);
return 0;
@@ -1580,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
{
+ unsigned int nhs = fib_info_num_path(fi);
struct nlmsghdr *nlh;
struct rtmsg *rtm;
@@ -1615,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
if (fi->fib_prefsrc &&
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
- if (fi->fib_nhs == 1) {
- struct fib_nh *nh = &fi->fib_nh[0];
+
+ if (fi->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
+ goto nla_put_failure;
+ if (nexthop_is_blackhole(fi->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+ }
+
+ if (nhs == 1) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
unsigned char flags = 0;
- if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
+ if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
goto nla_put_failure;
rtm->rtm_flags = flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (nh->nh_tclassid &&
- nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
- goto nla_put_failure;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
+ }
#endif
} else {
if (fib_add_multipath(skb, fi) < 0)
@@ -1709,7 +1870,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh,
* - if the new MTU is greater than the PMTU, don't make any change
* - otherwise, unlock and set PMTU
*/
-static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
{
struct fnhe_hash_bucket *bucket;
int i;
@@ -1745,7 +1906,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
hlist_for_each_entry(nh, head, nh_hash) {
if (nh->fib_nh_dev == dev)
- nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+ fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
}
}
@@ -1754,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
* NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed
* NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed
+ *
+ * only used when fib_nh is built into fib_info
*/
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
@@ -1835,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ struct fib_nh *nh;
if (fa->fa_slen != slen)
continue;
@@ -1856,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
- if (!next_fi->fib_nh[0].fib_nh_gw4 ||
- next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
+
+ nh = fib_info_nh(next_fi, 0);
+ if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK)
continue;
fib_alias_accessed(fa);
@@ -1899,6 +2064,8 @@ out:
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
+ *
+ * only used when fib_nh is built into fib_info
*/
int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
@@ -1993,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash)
struct net *net = fi->fib_net;
bool first = false;
+ if (unlikely(res->fi->nh)) {
+ nexthop_path_fib_result(res, hash);
+ return;
+ }
+
change_nexthops(fi) {
if (net->ipv4.sysctl_fib_multipath_use_neigh) {
if (!fib_good_nh(nexthop_nh))
@@ -2021,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1) {
+ if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
fib_select_multipath(res, h);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 868c74771fa9..4400f5051977 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -338,12 +338,18 @@ static struct tnode *tnode_alloc(int bits)
static inline void empty_child_inc(struct key_vector *n)
{
- ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+ tn_info(n)->empty_children++;
+
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children++;
}
static inline void empty_child_dec(struct key_vector *n)
{
- tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children--;
+
+ tn_info(n)->empty_children--;
}
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
@@ -1449,6 +1455,7 @@ found:
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (unlikely(err < 0)) {
+out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
@@ -1457,7 +1464,13 @@ found:
}
if (fi->fib_flags & RTNH_F_DEAD)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+
+ if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) {
+ err = fib_props[RTN_BLACKHOLE].error;
+ goto out_reject;
+ }
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
if (nhc->nhc_flags & RTNH_F_DEAD)
@@ -1931,6 +1944,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
return found;
}
+/* derived from fib_trie_free */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+ struct nl_info *info)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct fib_alias *fa;
+
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ pn = node_parent(pn);
+ cindex = get_index(pkey, pn);
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+ continue;
+
+ rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id,
+ info, NLM_F_REPLACE);
+
+ /* call_fib_entry_notifiers will be removed when
+ * in-kernel notifier is implemented and supported
+ * for nexthop objects
+ */
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ n->key,
+ KEYLENGTH - fa->fa_slen, fa,
+ NULL);
+ }
+ }
+}
+
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ __fib_info_notify_update(net, tb, info);
+ }
+}
+
static void fib_leaf_notify(struct net *net, struct key_vector *l,
struct fib_table *tb, struct notifier_block *nb)
{
@@ -2006,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
{
unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
+ int i, s_i, i_fa, s_fa, err;
struct fib_alias *fa;
- int i, s_i;
- if (filter->filter_set)
+ if (filter->filter_set ||
+ !filter->dump_exceptions || !filter->dump_routes)
flags |= NLM_F_DUMP_FILTERED;
s_i = cb->args[4];
+ s_fa = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- int err;
+ struct fib_info *fi = fa->fa_info;
if (i < s_i)
goto next;
+ i_fa = 0;
+
if (tb->tb_id != fa->tb_id)
goto next;
@@ -2030,29 +2118,43 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
goto next;
if ((filter->protocol &&
- fa->fa_info->fib_protocol != filter->protocol))
+ fi->fib_protocol != filter->protocol))
goto next;
if (filter->dev &&
- !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+ !fib_info_nh_uses_dev(fi, filter->dev))
goto next;
}
- err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- tb->tb_id, fa->fa_type,
- xkey, KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fa->fa_info, flags);
- if (err < 0) {
- cb->args[4] = i;
- return err;
+ if (filter->dump_routes && !s_fa) {
+ err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ tb->tb_id, fa->fa_type,
+ xkey, KEYLENGTH - fa->fa_slen,
+ fa->fa_tos, fi, flags);
+ if (err < 0)
+ goto stop;
+ i_fa++;
}
+
+ if (filter->dump_exceptions) {
+ err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+ &i_fa, s_fa);
+ if (err < 0)
+ goto stop;
+ }
+
next:
i++;
}
cb->args[4] = i;
return skb->len;
+
+stop:
+ cb->args[4] = i;
+ cb->args[5] = i_fa;
+ return err;
}
/* rcu_read_lock needs to be hold by caller from readside */
@@ -2634,14 +2736,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock();
}
-static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
unsigned int flags = 0;
if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
flags = RTF_REJECT;
- if (fi && fi->fib_nh->fib_nh_gw4)
- flags |= RTF_GATEWAY;
+ if (fi) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+
+ if (nhc->nhc_gw.ipv4)
+ flags |= RTF_GATEWAY;
+ }
if (mask == htonl(0xFFFFFFFF))
flags |= RTF_HOST;
flags |= RTF_UP;
@@ -2672,7 +2778,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
prefix = htonl(l->key);
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- const struct fib_info *fi = fa->fa_info;
+ struct fib_info *fi = fa->fa_info;
__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
@@ -2685,26 +2791,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_setwidth(seq, 127);
- if (fi)
+ if (fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ __be32 gw = 0;
+
+ if (nhc->nhc_gw_family == AF_INET)
+ gw = nhc->nhc_gw.ipv4;
+
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->fib_nh_gw4, flags, 0, 0,
+ nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+ prefix, gw, flags, 0, 0,
fi->fib_priority,
mask,
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
- else
+ } else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
-
+ }
seq_pad(seq, '\n');
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 7c857c72aad1..1510e951f451 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -201,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
*/
static struct sock *icmp_sk(struct net *net)
{
- return *this_cpu_ptr(net->ipv4.icmp_sk);
+ return this_cpu_read(*net->ipv4.icmp_sk);
}
/* Called with BH disabled */
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a57f0d69eadb..9a206931a342 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -332,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
const struct flowi4 *fl4)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return htonl(INADDR_ANY);
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
- } endfor_ifa(in_dev);
+ }
return htonl(INADDR_ANY);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7fd6db3fe366..f5c163d4771b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -649,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
EXPORT_SYMBOL(inet_rtx_syn_ack);
/* return true if req was found in the ehash table */
-static bool reqsk_queue_unlink(struct request_sock_queue *queue,
- struct request_sock *req)
+static bool reqsk_queue_unlink(struct request_sock *req)
{
struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
@@ -669,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue,
void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
- if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ if (reqsk_queue_unlink(req)) {
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
reqsk_put(req);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ce6969896f5..d666756be5f1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -106,48 +106,90 @@ int inet_frags_init(struct inet_frags *f)
if (!f->frags_cachep)
return -ENOMEM;
+ refcount_set(&f->refcnt, 1);
+ init_completion(&f->completion);
return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
- /* We must wait that all inet_frag_destroy_rcu() have completed. */
- rcu_barrier();
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ wait_for_completion(&f->completion);
kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
struct inet_frag_queue *fq = ptr;
+ int count;
- /* If we can not cancel the timer, it means this frag_queue
- * is already disappearing, we have nothing to do.
- * Otherwise, we own a refcount until the end of this function.
- */
- if (!del_timer(&fq->timer))
- return;
+ count = del_timer_sync(&fq->timer) ? 1 : 0;
spin_lock_bh(&fq->lock);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq->flags |= INET_FRAG_COMPLETE;
- refcount_dec(&fq->refcnt);
+ count++;
+ } else if (fq->flags & INET_FRAG_HASH_DEAD) {
+ count++;
}
spin_unlock_bh(&fq->lock);
- inet_frag_put(fq);
+ if (refcount_sub_and_test(count, &fq->refcnt))
+ inet_frag_destroy(fq);
+}
+
+static void fqdir_work_fn(struct work_struct *work)
+{
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+ struct inet_frags *f = fqdir->f;
+
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+ /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+ * have completed, since they need to dereference fqdir.
+ * Would it not be nice to have kfree_rcu_barrier() ? :)
+ */
+ rcu_barrier();
+
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ kfree(fqdir);
}
-void inet_frags_exit_net(struct netns_frags *nf)
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
- nf->high_thresh = 0; /* prevent creation of new frags */
+ struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+ int res;
- rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+ if (!fqdir)
+ return -ENOMEM;
+ fqdir->f = f;
+ fqdir->net = net;
+ res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+ if (res < 0) {
+ kfree(fqdir);
+ return res;
+ }
+ refcount_inc(&f->refcnt);
+ *fqdirp = fqdir;
+ return 0;
}
-EXPORT_SYMBOL(inet_frags_exit_net);
+EXPORT_SYMBOL(fqdir_init);
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+ INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+ queue_work(system_wq, &fqdir->destroy_work);
+}
+EXPORT_SYMBOL(fqdir_exit);
void inet_frag_kill(struct inet_frag_queue *fq)
{
@@ -155,11 +197,23 @@ void inet_frag_kill(struct inet_frag_queue *fq)
refcount_dec(&fq->refcnt);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
- struct netns_frags *nf = fq->net;
+ struct fqdir *fqdir = fq->fqdir;
fq->flags |= INET_FRAG_COMPLETE;
- rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
- refcount_dec(&fq->refcnt);
+ rcu_read_lock();
+ /* The RCU read lock provides a memory barrier
+ * guaranteeing that if fqdir->dead is false then
+ * the hash table destruction will not start until
+ * after we unlock. Paired with inet_frags_exit_net().
+ */
+ if (!fqdir->dead) {
+ rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+ fqdir->f->rhash_params);
+ refcount_dec(&fq->refcnt);
+ } else {
+ fq->flags |= INET_FRAG_HASH_DEAD;
+ }
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -168,7 +222,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
- struct inet_frags *f = q->net->f;
+ struct inet_frags *f = q->fqdir->f;
if (f->destructor)
f->destructor(q);
@@ -199,7 +253,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
- struct netns_frags *nf;
+ struct fqdir *fqdir;
unsigned int sum, sum_truesize = 0;
struct inet_frags *f;
@@ -207,18 +261,18 @@ void inet_frag_destroy(struct inet_frag_queue *q)
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
- nf = q->net;
- f = nf->f;
+ fqdir = q->fqdir;
+ f = fqdir->f;
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
- sub_frag_mem_limit(nf, sum);
+ sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
struct inet_frags *f,
void *arg)
{
@@ -228,9 +282,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
if (!q)
return NULL;
- q->net = nf;
+ q->fqdir = fqdir;
f->constructor(q, arg);
- add_frag_mem_limit(nf, f->qsize);
+ add_frag_mem_limit(fqdir, f->qsize);
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
@@ -239,21 +293,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
return q;
}
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
void *arg,
struct inet_frag_queue **prev)
{
- struct inet_frags *f = nf->f;
+ struct inet_frags *f = fqdir->f;
struct inet_frag_queue *q;
- q = inet_frag_alloc(nf, f, arg);
+ q = inet_frag_alloc(fqdir, f, arg);
if (!q) {
*prev = ERR_PTR(-ENOMEM);
return NULL;
}
- mod_timer(&q->timer, jiffies + nf->timeout);
+ mod_timer(&q->timer, jiffies + fqdir->timeout);
- *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
+ *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
q->flags |= INET_FRAG_COMPLETE;
@@ -265,18 +319,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
}
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
struct inet_frag_queue *fq = NULL, *prev;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
return NULL;
rcu_read_lock();
- prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+ prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
if (!prev)
- fq = inet_frag_create(nf, key, &prev);
+ fq = inet_frag_create(fqdir, key, &prev);
if (prev && !IS_ERR(prev)) {
fq = prev;
if (!refcount_inc_not_zero(&fq->refcnt))
@@ -387,7 +441,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
delta += head->truesize;
if (delta)
- add_frag_mem_limit(q->net, delta);
+ add_frag_mem_limit(q->fqdir, delta);
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
@@ -409,7 +463,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(q->net, clone->truesize);
+ add_frag_mem_limit(q->fqdir, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
@@ -462,7 +516,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
rbn = rbnext;
}
}
- sub_frag_mem_limit(q->net, head->truesize);
+ sub_frag_mem_limit(q->fqdir, head->truesize);
*nextp = NULL;
skb_mark_not_on_list(head);
@@ -490,7 +544,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
if (head == q->fragments_tail)
q->fragments_tail = NULL;
- sub_frag_mem_limit(q->net, head->truesize);
+ sub_frag_mem_limit(q->fqdir, head->truesize);
return head;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c4503073248b..97824864e40d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -316,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net,
saddr, sport, htonl(INADDR_ANY), hnum,
dif, sdif);
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cf2b0a6a3337..4385eb9e781f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
- frags);
- struct net *net = container_of(ipv4, struct net, ipv4);
+ struct net *net = q->fqdir->net;
const struct frag_v4_compare_key *key = a;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->net->max_dist ?
+ qp->peer = q->fqdir->max_dist ?
inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
NULL;
}
@@ -142,9 +140,13 @@ static void ip_expire(struct timer_list *t)
int err;
qp = container_of(frag, struct ipq, q);
- net = container_of(qp->q.net, struct net, ipv4.frags);
+ net = qp->q.fqdir->net;
rcu_read_lock();
+
+ if (qp->q.fqdir->dead)
+ goto out_rcu_unlock;
+
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -211,7 +213,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->ipv4.frags, &key);
+ q = inet_frag_find(net->ipv4.fqdir, &key);
if (!q)
return NULL;
@@ -222,7 +224,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = qp->q.net->max_dist;
+ unsigned int max = qp->q.fqdir->max_dist;
unsigned int start, end;
int rc;
@@ -236,12 +238,8 @@ static int ip_frag_too_far(struct ipq *qp)
rc = qp->q.fragments_tail && (end - start) > max;
- if (rc) {
- struct net *net;
-
- net = container_of(qp->q.net, struct net, ipv4.frags);
- __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
- }
+ if (rc)
+ __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
return rc;
}
@@ -250,13 +248,13 @@ static int ip_frag_reinit(struct ipq *qp)
{
unsigned int sum_truesize = 0;
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
+ sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
@@ -273,7 +271,7 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
struct sk_buff *prev_tail;
struct net_device *dev;
@@ -352,7 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(qp->q.net, skb->truesize);
+ add_frag_mem_limit(qp->q.fqdir, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
@@ -399,7 +397,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
void *reasm_data;
int len, err;
@@ -544,30 +542,24 @@ static int dist_min;
static struct ctl_table ip4_frags_ns_ctl_table[] = {
{
.procname = "ipfrag_high_thresh",
- .data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
- .data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
- .data = &init_net.ipv4.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ipfrag_max_dist",
- .data = &init_net.ipv4.frags.max_dist,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -600,13 +592,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv4.frags.high_thresh;
- table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[1].data = &net->ipv4.frags.low_thresh;
- table[1].extra2 = &net->ipv4.frags.high_thresh;
- table[2].data = &net->ipv4.frags.timeout;
- table[3].data = &net->ipv4.frags.max_dist;
}
+ table[0].data = &net->ipv4.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv4.fqdir->low_thresh;
+ table[1].data = &net->ipv4.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv4.fqdir->high_thresh;
+ table[2].data = &net->ipv4.fqdir->timeout;
+ table[3].data = &net->ipv4.fqdir->max_dist;
hdr = register_net_sysctl(net, "net/ipv4", table);
if (!hdr)
@@ -654,6 +646,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
+ res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+ if (res < 0)
+ return res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -668,36 +663,38 @@ static int __net_init ipv4_frags_init_net(struct net *net)
* we will prune down to 3MB, making room for approx 8 big 64K
* fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
- net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+ net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
* by TTL.
*/
- net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.fqdir->timeout = IP_FRAG_TIME;
- net->ipv4.frags.max_dist = 64;
- net->ipv4.frags.f = &ip4_frags;
+ net->ipv4.fqdir->max_dist = 64;
- res = inet_frags_init_net(&net->ipv4.frags);
- if (res < 0)
- return res;
res = ip4_frags_ns_ctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
return res;
}
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv4.fqdir);
+}
+
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
ip4_frags_ns_ctl_unregister(net);
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
}
static struct pernet_operations ip4_frags_ops = {
- .init = ipv4_frags_init_net,
- .exit = ipv4_frags_exit_net,
+ .init = ipv4_frags_init_net,
+ .pre_exit = ipv4_frags_pre_exit_net,
+ .exit = ipv4_frags_exit_net,
};
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 3db31bb9df50..ddaa01ec2bce 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -473,6 +473,7 @@ error:
*info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
}
+EXPORT_SYMBOL(__ip_options_compile);
int ip_options_compile(struct net *net,
struct ip_options *opt, struct sk_buff *skb)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8c2ec35b6512..cc7ef0d05bbd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
return ret;
}
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -315,14 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
static int ip_mc_finish_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
struct rtable *new_rt;
- int ret;
+ bool do_cn = false;
+ int ret, err;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
+ switch (ret) {
+ case NET_XMIT_CN:
+ do_cn = true;
+ /* fall through */
+ case NET_XMIT_SUCCESS:
+ break;
+ default:
kfree_skb(skb);
return ret;
}
@@ -338,7 +354,8 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
skb_dst_set(skb, &new_rt->dst);
}
- return dev_loopback_xmit(net, sk, skb);
+ err = dev_loopback_xmit(net, sk, skb);
+ return (do_cn && err) ? ret : err;
}
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -537,9 +554,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_hash(to, from);
- /* Copy the flags to each fragment. */
- IPCB(to)->flags = IPCB(from)->flags;
-
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
@@ -573,6 +587,175 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return ip_do_fragment(net, sk, skb, output);
}
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int hlen, struct ip_fraglist_iter *iter)
+{
+ unsigned int first_len = skb_pagelen(skb);
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->iph = iph;
+ iter->hlen = hlen;
+
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+static void ip_fraglist_ipcb_prepare(struct sk_buff *skb,
+ struct ip_fraglist_iter *iter)
+{
+ struct sk_buff *to = iter->frag;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(skb)->flags;
+
+ if (iter->offset == 0)
+ ip_options_fragment(to);
+}
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{
+ unsigned int hlen = iter->hlen;
+ struct iphdr *iph = iter->iph;
+ struct sk_buff *frag;
+
+ frag = iter->frag;
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iter->iph = ip_hdr(frag);
+ iph = iter->iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ iter->offset += skb->len - hlen;
+ iph->frag_off = htons(iter->offset >> 3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+ unsigned int ll_rs, unsigned int mtu,
+ struct ip_frag_state *state)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ state->hlen = hlen;
+ state->ll_rs = ll_rs;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ state->not_last_frag = iph->frag_off & htons(IP_MF);
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
+ bool first_frag, struct ip_frag_state *state)
+{
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+ if (IPCB(from)->flags & IPSKB_FRAG_PMTU)
+ state->iph->frag_off |= htons(IP_DF);
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (first_frag)
+ ip_options_fragment(from);
+}
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{
+ unsigned int len = state->left;
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+ if (!skb2)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, state->ll_rs);
+ skb_put(skb2, len + state->hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + state->hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+ BUG();
+ state->left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((state->offset >> 3));
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (state->left > 0 || state->not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ state->ptr += len;
+ state->offset += len;
+
+ iph->tot_len = htons(len + state->hlen);
+
+ ip_send_check(iph);
+
+ return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -584,12 +767,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
- int ptr;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs;
- int offset;
- __be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
+ unsigned int mtu, hlen, ll_rs;
+ struct ip_fraglist_iter iter;
+ struct ip_frag_state state;
int err = 0;
/* for offloaded checksums cleanup checksum before fragmentation */
@@ -654,49 +836,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
}
/* Everything is OK. Generate! */
-
- err = 0;
- offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- iph->tot_len = htons(first_len);
- iph->frag_off = htons(IP_MF);
- ip_send_check(iph);
+ ip_fraglist_init(skb, iph, hlen, &iter);
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), iph, hlen);
- iph = ip_hdr(frag);
- iph->tot_len = htons(frag->len);
- ip_copy_metadata(frag, skb);
- if (offset == 0)
- ip_options_fragment(frag);
- offset += skb->len - hlen;
- iph->frag_off = htons(offset>>3);
- if (frag->next)
- iph->frag_off |= htons(IP_MF);
- /* Ready, complete checksum */
- ip_send_check(iph);
+ if (iter.frag) {
+ ip_fraglist_ipcb_prepare(skb, &iter);
+ ip_fraglist_prepare(skb, &iter);
}
err = output(net, sk, skb);
if (!err)
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb_mark_not_on_list(skb);
+ skb = ip_fraglist_next(&iter);
}
if (err == 0) {
@@ -704,7 +861,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
@@ -720,105 +877,29 @@ slow_path_clean:
}
slow_path:
- iph = ip_hdr(skb);
-
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
- not_last_frag = iph->frag_off & htons(IP_MF);
+ ip_frag_init(skb, hlen, ll_rs, mtu, &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
+ while (state.left > 0) {
+ bool first_frag = (state.offset == 0);
- /* Allocate buffer */
- skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
- if (!skb2) {
- err = -ENOMEM;
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
goto fail;
}
-
- /*
- * Set up data on packet
- */
-
- ip_copy_metadata(skb2, skb);
- skb_reserve(skb2, ll_rs);
- skb_put(skb2, len + hlen);
- skb_reset_network_header(skb2);
- skb2->transport_header = skb2->network_header + hlen;
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
-
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
-
- skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
-
- /*
- * Copy a block of the IP datagram.
- */
- if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
- BUG();
- left -= len;
-
- /*
- * Fill in the new header fields.
- */
- iph = ip_hdr(skb2);
- iph->frag_off = htons((offset >> 3));
-
- if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
- iph->frag_off |= htons(IP_DF);
-
- /* ANK: dirty, but effective trick. Upgrade options only if
- * the segment to be fragmented was THE FIRST (otherwise,
- * options are already fixed) and make it ONCE
- * on the initial skb, so that all the following fragments
- * will inherit fixed options.
- */
- if (offset == 0)
- ip_options_fragment(skb);
-
- /*
- * Added AC : If we are fragmenting a fragment that's not the
- * last fragment then keep MF on each bit
- */
- if (left > 0 || not_last_frag)
- iph->frag_off |= htons(IP_MF);
- ptr += len;
- offset += len;
+ ip_frag_ipcb(skb, skb2, first_frag, &state);
/*
* Put this fragment into the sending queue.
*/
- iph->tot_len = htons(len + hlen);
-
- ip_send_check(iph);
-
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1568,7 +1649,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
- unsigned int len)
+ unsigned int len, u64 transmit_time)
{
struct ip_options_data replyopts;
struct ipcm_cookie ipc;
@@ -1584,6 +1665,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
ipcm_init(&ipc);
ipc.addr = daddr;
+ ipc.sockc.transmit_time = transmit_time;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 87ca2c42359b..a4e07e5e9c11 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par)
unsigned char *arpptr;
int pln, hln;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return NF_DROP;
arp = arp_hdr(skb);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 5f116c3749b4..5930d3b02555 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -29,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
__u8 oldtos;
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
iph = ip_hdr(skb);
oldtos = iph->tos;
@@ -58,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
tcph->cwr == einfo->proto.tcp.cwr))
return true;
- if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
return false;
tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 64d9563c0218..8e7f84ec783d 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -3,258 +3,11 @@
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct iphdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
- __be32 daddr)
-{
- struct iphdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- iph->version = 4;
- iph->ihl = sizeof(*iph) / 4;
- iph->tos = 0;
- iph->id = 0;
- iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
- iph->protocol = IPPROTO_TCP;
- iph->check = 0;
- iph->saddr = saddr;
- iph->daddr = daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct iphdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- skb_dst_set_noref(nskb, skb_dst(skb));
- nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
- goto free_nskb;
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
@@ -306,135 +59,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- unsigned int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb) ||
- ip_hdr(skb)->protocol != IPPROTO_TCP)
- return NF_ACCEPT;
-
- thoff = ip_hdrlen(skb);
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv4_synproxy_ops[] = {
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg4_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -449,16 +73,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref4 == 0) {
- err = nf_register_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv4_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref4++;
return err;
}
@@ -466,10 +86,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref4--;
- if (snet->hook_ref4 == 0)
- nf_unregister_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
+ nf_synproxy_ipv4_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 6eefde5bc468..69697eb4bfc6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -2,7 +2,7 @@
/*
* 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index dfea10f13878..87b711fd5a44 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -6,7 +6,7 @@
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#include <linux/module.h>
@@ -58,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
- /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+ /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy
* or pull everything in a linear buffer, so we can safely
* use the skb pointers now */
*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index 657d2dcec3cc..717b726504fe 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index b6dd39636bea..b2bae0b0e42a 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -49,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
+ const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
@@ -57,10 +58,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
- for_primary_ifa(indev) {
+
+ in_dev_for_each_ifa_rcu(ifa, indev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
laddr = ifa->ifa_local;
break;
- } endfor_ifa(indev);
+ }
return laddr ? laddr : daddr;
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..5fe5a3981d43
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,1828 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
+#include <net/ndisc.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/sock.h>
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS 8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
+ [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 },
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_GROUP] = { .type = NLA_BINARY },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_BLACKHOLE] = { .type = NLA_FLAG },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GATEWAY] = { .type = NLA_BINARY },
+ [NHA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [NHA_ENCAP] = { .type = NLA_NESTED },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
+ [NHA_MASTER] = { .type = NLA_U32 },
+};
+
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+ unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+ return (val ^
+ (val >> NH_DEV_HASHBITS) ^
+ (val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+ struct net_device *dev = nhi->fib_nhc.nhc_dev;
+ struct hlist_head *head;
+ unsigned int hash;
+
+ WARN_ON(!dev);
+
+ hash = nh_dev_hashfn(dev->ifindex);
+ head = &net->nexthop.devhash[hash];
+ hlist_add_head(&nhi->dev_hash, head);
+}
+
+static void nexthop_free_mpath(struct nexthop *nh)
+{
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_raw(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; ++i)
+ WARN_ON(nhg->nh_entries[i].nh);
+
+ kfree(nhg);
+}
+
+static void nexthop_free_single(struct nexthop *nh)
+{
+ struct nh_info *nhi;
+
+ nhi = rcu_dereference_raw(nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh_release(nh->net, &nhi->fib_nh);
+ break;
+ case AF_INET6:
+ ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+ break;
+ }
+ kfree(nhi);
+}
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+ struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+ if (nh->is_group)
+ nexthop_free_mpath(nh);
+ else
+ nexthop_free_single(nh);
+
+ kfree(nh);
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+static struct nexthop *nexthop_alloc(void)
+{
+ struct nexthop *nh;
+
+ nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+ if (nh) {
+ INIT_LIST_HEAD(&nh->fi_list);
+ INIT_LIST_HEAD(&nh->f6i_list);
+ INIT_LIST_HEAD(&nh->grp_list);
+ }
+ return nh;
+}
+
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+ size_t sz = offsetof(struct nexthop, nh_grp)
+ + sizeof(struct nh_group)
+ + sizeof(struct nh_grp_entry) * num_nh;
+ struct nh_group *nhg;
+
+ nhg = kzalloc(sz, GFP_KERNEL);
+ if (nhg)
+ nhg->num_nh = num_nh;
+
+ return nhg;
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+ while (++net->nexthop.seq == 0)
+ ;
+}
+
+/* no reference taken; rcu lock or rtnl must be held */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+
+ pp = &net->nexthop.rb_root.rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = rcu_dereference_raw(*pp);
+ if (!next)
+ break;
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (id < nh->id)
+ pp = &next->rb_left;
+ else if (id > nh->id)
+ pp = &next->rb_right;
+ else
+ return nh;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* used for auto id allocation; called with rtnl held */
+static u32 nh_find_unused_id(struct net *net)
+{
+ u32 id_start = net->nexthop.last_id_allocated;
+
+ while (1) {
+ net->nexthop.last_id_allocated++;
+ if (net->nexthop.last_id_allocated == id_start)
+ break;
+
+ if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+ return net->nexthop.last_id_allocated;
+ }
+ return 0;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
+{
+ struct nexthop_grp *p;
+ size_t len = nhg->num_nh * sizeof(*p);
+ struct nlattr *nla;
+ u16 group_type = 0;
+ int i;
+
+ if (nhg->mpath)
+ group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+ if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, NHA_GROUP, len);
+ if (!nla)
+ goto nla_put_failure;
+
+ p = nla_data(nla);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ p->id = nhg->nh_entries[i].nh->id;
+ p->weight = nhg->nh_entries[i].weight - 1;
+ p += 1;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+ int event, u32 portid, u32 seq, unsigned int nlflags)
+{
+ struct fib6_nh *fib6_nh;
+ struct fib_nh *fib_nh;
+ struct nlmsghdr *nlh;
+ struct nh_info *nhi;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = nh->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nla_put_nh_group(skb, nhg))
+ goto nla_put_failure;
+ goto out;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ nhm->nh_family = nhi->family;
+ if (nhi->reject_nh) {
+ if (nla_put_flag(skb, NHA_BLACKHOLE))
+ goto nla_put_failure;
+ goto out;
+ } else {
+ const struct net_device *dev;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh = &nhi->fib_nh;
+ if (fib_nh->fib_nh_gw_family &&
+ nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ goto nla_put_failure;
+ break;
+
+ case AF_INET6:
+ fib6_nh = &nhi->fib6_nh;
+ if (fib6_nh->fib_nh_gw_family &&
+ nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (nhi->fib_nhc.nhc_lwtstate &&
+ lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+ NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+
+ return nla_total_size(sz) +
+ nla_total_size(2); /* NHA_GROUP_TYPE */
+}
+
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+ size_t sz;
+
+ /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+ * are mutually exclusive
+ */
+ sz = nla_total_size(4); /* NHA_OIF */
+
+ switch (nhi->family) {
+ case AF_INET:
+ if (nhi->fib_nh.fib_nh_gw_family)
+ sz += nla_total_size(4); /* NHA_GATEWAY */
+ break;
+
+ case AF_INET6:
+ /* NHA_GATEWAY */
+ if (nhi->fib6_nh.fib_nh_gw_family)
+ sz += nla_total_size(sizeof(const struct in6_addr));
+ break;
+ }
+
+ if (nhi->fib_nhc.nhc_lwtstate) {
+ sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+ sz += nla_total_size(2); /* NHA_ENCAP_TYPE */
+ }
+
+ return sz;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+ size_t sz = nla_total_size(4); /* NHA_ID */
+
+ if (nh->is_group)
+ sz += nh_nlmsg_size_grp(nh);
+ else
+ sz += nh_nlmsg_size_single(nh);
+
+ return sz;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+ unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+ struct netlink_ext_ack *extack)
+{
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ /* nested multipath (group within a group) is not
+ * supported
+ */
+ if (nhg->mpath) {
+ NL_SET_ERR_MSG(extack,
+ "Multipath group can not be a nexthop within a group");
+ return false;
+ }
+ } else {
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ if (nhi->reject_nh && npaths > 1) {
+ NL_SET_ERR_MSG(extack,
+ "Blackhole nexthop can not be used in a group with more than 1 path");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ unsigned int len = nla_len(tb[NHA_GROUP]);
+ struct nexthop_grp *nhg;
+ unsigned int i, j;
+
+ if (len & (sizeof(struct nexthop_grp) - 1)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid length for nexthop group attribute");
+ return -EINVAL;
+ }
+
+ /* convert len to number of nexthop ids */
+ len /= sizeof(*nhg);
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ if (nhg[i].resvd1 || nhg[i].resvd2) {
+ NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+ return -EINVAL;
+ }
+ if (nhg[i].weight > 254) {
+ NL_SET_ERR_MSG(extack, "Invalid value for weight");
+ return -EINVAL;
+ }
+ for (j = i + 1; j < len; ++j) {
+ if (nhg[i].id == nhg[j].id) {
+ NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+ return -EINVAL;
+ }
+ }
+ }
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ struct nexthop *nh;
+
+ nh = nexthop_find_by_id(net, nhg[i].id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+ if (!valid_group_nh(nh, len, extack))
+ return -EINVAL;
+ }
+ for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ NL_SET_ERR_MSG(extack,
+ "No other attributes can be set in nexthop groups");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+
+ return !!(state & NUD_VALID);
+}
+
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+
+ return !!(state & NUD_VALID);
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+ struct nexthop *rc = NULL;
+ struct nh_group *nhg;
+ int i;
+
+ if (!nh->is_group)
+ return nh;
+
+ nhg = rcu_dereference(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi;
+
+ if (hash > atomic_read(&nhge->upper_bound))
+ continue;
+
+ /* nexthops always check if it is good and does
+ * not rely on a sysctl for this behavior
+ */
+ nhi = rcu_dereference(nhge->nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ if (ipv4_good_nh(&nhi->fib_nh))
+ return nhge->nh;
+ break;
+ case AF_INET6:
+ if (ipv6_good_nh(&nhi->fib6_nh))
+ return nhge->nh;
+ break;
+ }
+
+ if (!rc)
+ rc = nhge->nh;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+ int (*cb)(struct fib6_nh *nh, void *arg),
+ void *arg)
+{
+ struct nh_info *nhi;
+ int err;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+ } else {
+ nhi = rcu_dereference_rtnl(nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
+static int check_src_addr(const struct in6_addr *saddr,
+ struct netlink_ext_ack *extack)
+{
+ if (!ipv6_addr_any(saddr)) {
+ NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ /* fib6_src is unique to a fib6_info and limits the ability to cache
+ * routes in fib6_nh within a nexthop that is potentially shared
+ * across multiple fib entries. If the config wants to use source
+ * routing it can not use nexthop objects. mlxsw also does not allow
+ * fib6_src on routes.
+ */
+ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
+ return -EINVAL;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (nhg->has_v4)
+ goto no_v4_nh;
+ } else {
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->family == AF_INET)
+ goto no_v4_nh;
+ }
+
+ return 0;
+no_v4_nh:
+ NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
+/* if existing nexthop has ipv6 routes linked to it, need
+ * to verify this new spec works with ipv6
+ */
+static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_info *f6i;
+
+ if (list_empty(&old->f6i_list))
+ return 0;
+
+ list_for_each_entry(f6i, &old->f6i_list, nh_list) {
+ if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
+ return -EINVAL;
+ }
+
+ return fib6_check_nexthop(new, NULL, extack);
+}
+
+static int nexthop_check_scope(struct nexthop *nh, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
+ NL_SET_ERR_MSG(extack,
+ "Route with host scope can not have a gateway");
+ return -EINVAL;
+ }
+
+ if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * object is used.
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ int err = 0;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ if (scope == RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+ err = -EINVAL;
+ goto out;
+ }
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ /* all nexthops in a group have the same scope */
+ err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
+ } else {
+ err = nexthop_check_scope(nh, scope, extack);
+ }
+out:
+ return err;
+}
+
+static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &old->fi_list, nh_list) {
+ int err;
+
+ err = fib_check_nexthop(new, fi->fib_scope, extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+ int total = 0;
+ int w = 0;
+ int i;
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ int upper_bound;
+
+ w += nhge->weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
+ atomic_set(&nhge->upper_bound, upper_bound);
+ }
+}
+
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge,
+ struct nh_group *nhg,
+ struct nl_info *nlinfo)
+{
+ struct nexthop *nh = nhge->nh;
+ struct nh_grp_entry *nhges;
+ bool found = false;
+ int i;
+
+ WARN_ON(!nh);
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; ++i) {
+ if (found) {
+ nhges[i-1].nh = nhges[i].nh;
+ nhges[i-1].weight = nhges[i].weight;
+ list_del(&nhges[i].nh_list);
+ list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
+ } else if (nhg->nh_entries[i].nh == nh) {
+ found = true;
+ }
+ }
+
+ if (WARN_ON(!found))
+ return;
+
+ nhg->num_nh--;
+ nhg->nh_entries[nhg->num_nh].nh = NULL;
+
+ nh_group_rebalance(nhg);
+
+ nexthop_put(nh);
+
+ if (nlinfo)
+ nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ struct nh_grp_entry *nhge, *tmp;
+
+ list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+ struct nh_group *nhg;
+
+ list_del(&nhge->nh_list);
+ nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
+ remove_nh_grp_entry(nhge, nhg, nlinfo);
+
+ /* if this group has no more entries then remove it */
+ if (!nhg->num_nh)
+ remove_nexthop(net, nhge->nh_parent, nlinfo);
+ }
+}
+
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{
+ struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+ int i, num_nh = nhg->num_nh;
+
+ for (i = 0; i < num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ if (WARN_ON(!nhge->nh))
+ continue;
+
+ list_del(&nhge->nh_list);
+ nexthop_put(nhge->nh);
+ nhge->nh = NULL;
+ nhg->num_nh--;
+ }
+}
+
+/* not called for nexthop replace */
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+ struct fib6_info *f6i, *tmp;
+ bool do_flush = false;
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ do_flush = true;
+ }
+ if (do_flush)
+ fib_flush(net);
+
+ /* ip6_del_rt removes the entry from this list hence the _safe */
+ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+ /* __ip6_del_rt does a release, so do a hold here */
+ fib6_info_hold(f6i);
+ ipv6_stub->ip6_del_rt(net, f6i);
+ }
+}
+
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ __remove_nexthop_fib(net, nh);
+
+ if (nh->is_group) {
+ remove_nexthop_group(nh, nlinfo);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fib_nhc.nhc_dev)
+ hlist_del(&nhi->dev_hash);
+
+ remove_nexthop_from_groups(net, nh, nlinfo);
+ }
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ /* remove from the tree */
+ rb_erase(&nh->rb_node, &net->nexthop.rb_root);
+
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+
+ __remove_nexthop(net, nh, nlinfo);
+ nh_base_seq_inc(net);
+
+ nexthop_put(nh);
+}
+
+/* if any FIB entries reference this nexthop, any dst entries
+ * need to be regenerated
+ */
+static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
+{
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list))
+ rt_cache_flush(net);
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_update_sernum(net, f6i);
+}
+
+static int replace_nexthop_grp(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_group *oldg, *newg;
+ int i;
+
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; i++)
+ newg->nh_entries[i].nh_parent = old;
+
+ rcu_assign_pointer(old->nh_grp, newg);
+
+ for (i = 0; i < oldg->num_nh; i++)
+ oldg->nh_entries[i].nh_parent = new;
+
+ rcu_assign_pointer(new->nh_grp, oldg);
+
+ return 0;
+}
+
+static int replace_nexthop_single(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *oldi, *newi;
+
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+
+ old->protocol = new->protocol;
+ old->nh_flags = new->nh_flags;
+
+ rcu_assign_pointer(old->nh_info, newi);
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ return 0;
+}
+
+static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list)) {
+ struct fib_info *fi;
+
+ /* expectation is a few fib_info per nexthop and then
+ * a lot of routes per fib_info. So mark the fib_info
+ * and then walk the fib tables once
+ */
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = true;
+
+ fib_info_notify_update(net, info);
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = false;
+ }
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_rt_update(net, f6i, info);
+}
+
+/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
+ * linked to this nexthop and for all groups that the nexthop
+ * is a member of
+ */
+static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct nh_grp_entry *nhge;
+
+ __nexthop_replace_notify(net, nh, info);
+
+ list_for_each_entry(nhge, &nh->grp_list, nh_list)
+ __nexthop_replace_notify(net, nhge->nh_parent, info);
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+ struct nexthop *new, struct netlink_ext_ack *extack)
+{
+ bool new_is_reject = false;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ /* check that existing FIB entries are ok with the
+ * new nexthop definition
+ */
+ err = fib_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ if (!new->is_group) {
+ struct nh_info *nhi = rtnl_dereference(new->nh_info);
+
+ new_is_reject = nhi->reject_nh;
+ }
+
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ /* if new nexthop is a blackhole, any groups using this
+ * nexthop cannot have more than 1 path
+ */
+ if (new_is_reject &&
+ nexthop_num_path(nhge->nh_parent) > 1) {
+ NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
+ return -EINVAL;
+ }
+
+ err = fib_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+ }
+
+ if (old->is_group)
+ err = replace_nexthop_grp(net, old, new, extack);
+ else
+ err = replace_nexthop_single(net, old, new, extack);
+
+ if (!err) {
+ nh_rt_cache_flush(net, old);
+
+ __remove_nexthop(net, new, NULL);
+ nexthop_put(new);
+ }
+
+ return err;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+ struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+ bool create = !!(cfg->nlflags & NLM_F_CREATE);
+ u32 new_id = new_nh->id;
+ int replace_notify = 0;
+ int rc = -EEXIST;
+
+ pp = &root->rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = rtnl_dereference(*pp);
+ if (!next)
+ break;
+
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (new_id < nh->id) {
+ pp = &next->rb_left;
+ } else if (new_id > nh->id) {
+ pp = &next->rb_right;
+ } else if (replace) {
+ rc = replace_nexthop(net, nh, new_nh, extack);
+ if (!rc) {
+ new_nh = nh; /* send notification with old nh */
+ replace_notify = 1;
+ }
+ goto out;
+ } else {
+ /* id already exists and not a replace */
+ goto out;
+ }
+ }
+
+ if (replace && !create) {
+ NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+ rc = -ENOENT;
+ goto out;
+ }
+
+ rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+ rb_insert_color(&new_nh->rb_node, root);
+ rc = 0;
+out:
+ if (!rc) {
+ nh_base_seq_inc(net);
+ nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+ if (replace_notify)
+ nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
+ }
+
+ return rc;
+}
+
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev != dev)
+ continue;
+
+ remove_nexthop(net, nhi->nh_parent, NULL);
+ }
+}
+
+/* rtnl; called when net namespace is deleted */
+static void flush_all_nexthops(struct net *net)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ struct nexthop *nh;
+
+ while ((node = rb_first(root))) {
+ nh = rb_entry(node, struct nexthop, rb_node);
+ remove_nexthop(net, nh, NULL);
+ cond_resched();
+ }
+}
+
+static struct nexthop *nexthop_create_group(struct net *net,
+ struct nh_config *cfg)
+{
+ struct nlattr *grps_attr = cfg->nh_grp;
+ struct nexthop_grp *entry = nla_data(grps_attr);
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int i;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nh->is_group = 1;
+
+ nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+ if (!nhg) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nexthop *nhe;
+ struct nh_info *nhi;
+
+ nhe = nexthop_find_by_id(net, entry[i].id);
+ if (!nexthop_get(nhe))
+ goto out_no_nh;
+
+ nhi = rtnl_dereference(nhe->nh_info);
+ if (nhi->family == AF_INET)
+ nhg->has_v4 = true;
+
+ nhg->nh_entries[i].nh = nhe;
+ nhg->nh_entries[i].weight = entry[i].weight + 1;
+ list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+ nhg->nh_entries[i].nh_parent = nh;
+ }
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+ nhg->mpath = 1;
+ nh_group_rebalance(nhg);
+ }
+
+ rcu_assign_pointer(nh->nh_grp, nhg);
+
+ return nh;
+
+out_no_nh:
+ for (; i >= 0; --i)
+ nexthop_put(nhg->nh_entries[i].nh);
+
+ kfree(nhg);
+ kfree(nh);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_nh *fib_nh = &nhi->fib_nh;
+ struct fib_config fib_cfg = {
+ .fc_oif = cfg->nh_ifindex,
+ .fc_gw4 = cfg->gw.ipv4,
+ .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+ .fc_flags = cfg->nh_flags,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ u32 tb_id = l3mdev_fib_table(cfg->dev);
+ int err = -EINVAL;
+
+ err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+ if (err) {
+ fib_nh_release(net, fib_nh);
+ goto out;
+ }
+
+ /* sets nh_dev if successful */
+ err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+ if (!err) {
+ nh->nh_flags = fib_nh->fib_nh_flags;
+ fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+ fib_nh->fib_nh_scope);
+ } else {
+ fib_nh_release(net, fib_nh);
+ }
+out:
+ return err;
+}
+
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+ struct fib6_config fib6_cfg = {
+ .fc_table = l3mdev_fib_table(cfg->dev),
+ .fc_ifindex = cfg->nh_ifindex,
+ .fc_gateway = cfg->gw.ipv6,
+ .fc_flags = cfg->nh_flags,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ int err;
+
+ if (!ipv6_addr_any(&cfg->gw.ipv6))
+ fib6_cfg.fc_flags |= RTF_GATEWAY;
+
+ /* sets nh_dev if successful */
+ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+ extack);
+ if (err)
+ ipv6_stub->fib6_nh_release(fib6_nh);
+ else
+ nh->nh_flags = fib6_nh->fib_nh_flags;
+
+ return err;
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+ struct nexthop *nh;
+ int err = 0;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+ if (!nhi) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nh->nh_flags = cfg->nh_flags;
+ nh->net = net;
+
+ nhi->nh_parent = nh;
+ nhi->family = cfg->nh_family;
+ nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+ if (cfg->nh_blackhole) {
+ nhi->reject_nh = 1;
+ cfg->nh_ifindex = net->loopback_dev->ifindex;
+ }
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+ break;
+ case AF_INET6:
+ err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+ break;
+ }
+
+ if (err) {
+ kfree(nhi);
+ kfree(nh);
+ return ERR_PTR(err);
+ }
+
+ /* add the entry to the device based hash */
+ nexthop_devhash_add(net, nhi);
+
+ rcu_assign_pointer(nh->nh_info, nhi);
+
+ return nh;
+}
+
+/* called with rtnl lock held */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nexthop *nh;
+ int err;
+
+ if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
+ NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!cfg->nh_id) {
+ cfg->nh_id = nh_find_unused_id(net);
+ if (!cfg->nh_id) {
+ NL_SET_ERR_MSG(extack, "No unused id");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ if (cfg->nh_grp)
+ nh = nexthop_create_group(net, cfg);
+ else
+ nh = nexthop_create(net, cfg, extack);
+
+ if (IS_ERR(nh))
+ return nh;
+
+ refcount_set(&nh->refcnt, 1);
+ nh->id = cfg->nh_id;
+ nh->protocol = cfg->nh_protocol;
+ nh->net = net;
+
+ err = insert_nexthop(net, nh, cfg, extack);
+ if (err) {
+ __remove_nexthop(net, nh, NULL);
+ nexthop_put(nh);
+ nh = ERR_PTR(err);
+ }
+
+ return nh;
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+ struct nlattr *tb[NHA_MAX + 1];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (nhm->resvd || nhm->nh_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+ goto out;
+ }
+ if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+ goto out;
+ }
+
+ switch (nhm->nh_family) {
+ case AF_INET:
+ case AF_INET6:
+ break;
+ case AF_UNSPEC:
+ if (tb[NHA_GROUP])
+ break;
+ /* fallthrough */
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid address family");
+ goto out;
+ }
+
+ if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
+ NL_SET_ERR_MSG(extack, "Invalid attributes in request");
+ goto out;
+ }
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->nlflags = nlh->nlmsg_flags;
+ cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->nlinfo.nlh = nlh;
+ cfg->nlinfo.nl_net = net;
+
+ cfg->nh_family = nhm->nh_family;
+ cfg->nh_protocol = nhm->nh_protocol;
+ cfg->nh_flags = nhm->nh_flags;
+
+ if (tb[NHA_ID])
+ cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+ if (tb[NHA_GROUP]) {
+ if (nhm->nh_family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid family for group");
+ goto out;
+ }
+ cfg->nh_grp = tb[NHA_GROUP];
+
+ cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+ if (tb[NHA_GROUP_TYPE])
+ cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+ if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid group type");
+ goto out;
+ }
+ err = nh_check_attr_group(net, tb, extack);
+
+ /* no other attributes should be set */
+ goto out;
+ }
+
+ if (tb[NHA_BLACKHOLE]) {
+ if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+ goto out;
+ }
+
+ cfg->nh_blackhole = 1;
+ err = 0;
+ goto out;
+ }
+
+ if (!tb[NHA_OIF]) {
+ NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+ goto out;
+ }
+
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ goto out;
+ } else if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ } else if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ err = -ENETDOWN;
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (tb[NHA_GATEWAY]) {
+ struct nlattr *gwa = tb[NHA_GATEWAY];
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ if (nla_len(gwa) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv4 = nla_get_be32(gwa);
+ break;
+ case AF_INET6:
+ if (nla_len(gwa) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack,
+ "Unknown address family for gateway");
+ goto out;
+ }
+ } else {
+ /* device only nexthop (no gateway) */
+ if (cfg->nh_flags & RTNH_F_ONLINK) {
+ NL_SET_ERR_MSG(extack,
+ "ONLINK flag can not be set for nexthop without a gateway");
+ goto out;
+ }
+ }
+
+ if (tb[NHA_ENCAP]) {
+ cfg->nh_encap = tb[NHA_ENCAP];
+
+ if (!tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+ goto out;
+ }
+
+ cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+ err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+ if (err < 0)
+ goto out;
+
+ } else if (tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+ goto out;
+ }
+
+
+ err = 0;
+out:
+ return err;
+}
+
+/* rtnl */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nh_config cfg;
+ struct nexthop *nh;
+ int err;
+
+ err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
+ if (!err) {
+ nh = nexthop_add(net, &cfg, extack);
+ if (IS_ERR(nh))
+ err = PTR_ERR(nh);
+ }
+
+ return err;
+}
+
+static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+ struct nlattr *tb[NHA_MAX + 1];
+ int err, i;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ for (i = 0; i < __NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NHA_ID:
+ break;
+ default:
+ NL_SET_ERR_MSG_ATTR(extack, tb[i],
+ "Unexpected attribute in request");
+ goto out;
+ }
+ }
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header");
+ goto out;
+ }
+
+ if (!tb[NHA_ID]) {
+ NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+ goto out;
+ }
+
+ *id = nla_get_u32(tb[NHA_ID]);
+ if (!(*id))
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ else
+ err = 0;
+out:
+ return err;
+}
+
+/* rtnl */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nl_info nlinfo = {
+ .nlh = nlh,
+ .nl_net = net,
+ .portid = NETLINK_CB(skb).portid,
+ };
+ struct nexthop *nh;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_del_req(nlh, &id, extack);
+ if (err)
+ return err;
+
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ return -ENOENT;
+
+ remove_nexthop(net, nh, &nlinfo);
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *skb = NULL;
+ struct nexthop *nh;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_del_req(nlh, &id, extack);
+ if (err)
+ return err;
+
+ err = -ENOBUFS;
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ err = -ENOENT;
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ goto errout_free;
+
+ err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ goto errout_free;
+ }
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ return err;
+errout_free:
+ kfree_skb(skb);
+ goto out;
+}
+
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
+ bool group_filter, u8 family)
+{
+ const struct net_device *dev;
+ const struct nh_info *nhi;
+
+ if (group_filter && !nh->is_group)
+ return true;
+
+ if (!dev_idx && !master_idx && !family)
+ return false;
+
+ if (nh->is_group)
+ return true;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (family && nhi->family != family)
+ return true;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev_idx && (!dev || dev->ifindex != dev_idx))
+ return true;
+
+ if (master_idx) {
+ struct net_device *master;
+
+ if (!dev)
+ return true;
+
+ master = netdev_master_upper_dev_get((struct net_device *)dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+ }
+
+ return false;
+}
+
+static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
+ int *master_idx, bool *group_filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[NHA_MAX + 1];
+ struct nhmsg *nhm;
+ int err, i;
+ u32 idx;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NHA_OIF:
+ idx = nla_get_u32(tb[i]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+ *dev_idx = idx;
+ break;
+ case NHA_MASTER:
+ idx = nla_get_u32(tb[i]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid master device index");
+ return -EINVAL;
+ }
+ *master_idx = idx;
+ break;
+ case NHA_GROUPS:
+ *group_filter = true;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ nhm = nlmsg_data(nlh);
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ int dev_filter_idx = 0, master_idx = 0;
+ struct net *net = sock_net(skb->sk);
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool group_filter = false;
+ struct rb_node *node;
+ int idx = 0, s_idx;
+ int err;
+
+ err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
+ &group_filter, cb);
+ if (err < 0)
+ return err;
+
+ s_idx = cb->args[0];
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct nexthop *nh;
+
+ if (idx < s_idx)
+ goto cont;
+
+ nh = rb_entry(node, struct nexthop, rb_node);
+ if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
+ group_filter, nhm->nh_family))
+ goto cont;
+
+ err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
+cont:
+ idx++;
+ }
+
+out:
+ err = skb->len;
+out_err:
+ cb->args[0] = idx;
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+ return err;
+}
+
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev == dev) {
+ if (nhi->family == AF_INET)
+ fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
+ orig_mtu);
+ }
+ }
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_info_ext *info_ext;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_flush_dev(dev);
+ break;
+ case NETDEV_CHANGE:
+ if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+ nexthop_flush_dev(dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ info_ext = ptr;
+ nexthop_sync_mtu(dev, info_ext->ext.mtu);
+ rt_cache_flush(dev_net(dev));
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+ .notifier_call = nh_netdev_event,
+};
+
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+ rtnl_lock();
+ flush_all_nexthops(net);
+ rtnl_unlock();
+ kfree(net->nexthop.devhash);
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{
+ size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
+ net->nexthop.rb_root = RB_ROOT;
+ net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+ if (!net->nexthop.devhash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+ .init = nexthop_net_init,
+ .exit = nexthop_net_exit,
+};
+
+static int __init nexthop_init(void)
+{
+ register_pernet_subsys(&nexthop_net_ops);
+
+ register_netdevice_notifier(&nh_netdev_notifier);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
+ rtm_dump_nexthop, 0);
+
+ rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+ rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+ return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 073273b751f8..cc90243ccf76 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -68,8 +68,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
seq_printf(seq, "FRAG: inuse %u memory %lu\n",
- atomic_read(&net->ipv4.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv4.frags));
+ atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv4.fqdir));
return 0;
}
@@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8ea0735a6754..a3e466b6a60c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -95,6 +95,7 @@
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
@@ -1580,7 +1581,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
#ifdef CONFIG_IP_ROUTE_CLASSID
- {
+ if (nhc->nhc_family == AF_INET) {
struct fib_nh *nh;
nh = container_of(nhc, struct fib_nh, nh_common);
@@ -1962,6 +1963,23 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else {
+ /* Same as case 0 */
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ break;
}
mhash = flow_hash_from_keys(&hash_keys);
@@ -1979,7 +1997,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
@@ -2714,7 +2732,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
@@ -2742,7 +2760,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
goto nla_put_failure;
#endif
- if (!rt_is_input_route(rt) &&
+ if (fl4 && !rt_is_input_route(rt) &&
fl4->saddr != src) {
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
@@ -2782,36 +2800,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (fl4->flowi4_mark &&
- nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
- goto nla_put_failure;
-
- if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
- nla_put_u32(skb, RTA_UID,
- from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
- goto nla_put_failure;
+ if (fl4) {
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
- error = rt->dst.error;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(),
+ fl4->flowi4_uid)))
+ goto nla_put_failure;
- if (rt_is_input_route(rt)) {
+ if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb,
- fl4->saddr, fl4->daddr,
- r, portid);
-
- if (err <= 0) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- }
- } else
+ if (ipv4_is_multicast(dst) &&
+ !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, portid);
+
+ if (err <= 0) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ }
+ } else
#endif
- if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
- goto nla_put_failure;
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+ goto nla_put_failure;
+ }
}
+ error = rt->dst.error;
+
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
@@ -2823,6 +2845,80 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, u32 table_id,
+ struct fnhe_hash_bucket *bucket, int genid,
+ int *fa_index, int fa_start)
+{
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ struct rtable *rt;
+ int err;
+
+ if (*fa_index < fa_start)
+ goto next;
+
+ if (fnhe->fnhe_genid != genid)
+ goto next;
+
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires))
+ goto next;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (!rt)
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (!rt)
+ goto next;
+
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+ table_id, NULL, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err)
+ return err;
+next:
+ (*fa_index)++;
+ }
+ }
+
+ return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+ u32 table_id, struct fib_info *fi,
+ int *fa_index, int fa_start)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ int nhsel, genid = fnhe_genid(net);
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+ struct fnhe_hash_bucket *bucket;
+ int err;
+
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ continue;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(nhc->nhc_exceptions);
+ err = 0;
+ if (bucket)
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
+ genid, fa_index, fa_start);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
u8 ip_proto, __be16 sport,
__be16 dport)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b6f14af926fa..7d66306b5f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -279,55 +279,96 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
+static int sscanf_key(char *buf, __le32 *key)
+{
+ u32 user_key[4];
+ int i, ret = 0;
+
+ if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ } else {
+ for (i = 0; i < ARRAY_SIZE(user_key); i++)
+ key[i] = cpu_to_le32(user_key[i]);
+ }
+ pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3], buf, ret);
+
+ return ret;
+}
+
static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_tcp_fastopen);
- struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
- struct tcp_fastopen_context *ctxt;
- u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
- __le32 key[4];
- int ret, i;
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)) };
+ struct tcp_fastopen_context *ctx;
+ u32 user_key[TCP_FASTOPEN_KEY_MAX * 4];
+ __le32 key[TCP_FASTOPEN_KEY_MAX * 4];
+ char *backup_data;
+ int ret, i = 0, off = 0, n_keys = 0;
tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
if (!tbl.data)
return -ENOMEM;
rcu_read_lock();
- ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
- if (ctxt)
- memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
- else
- memset(key, 0, sizeof(key));
+ ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctx) {
+ n_keys = tcp_fastopen_context_len(ctx);
+ memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys);
+ }
rcu_read_unlock();
- for (i = 0; i < ARRAY_SIZE(key); i++)
+ if (!n_keys) {
+ memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH);
+ n_keys = 1;
+ }
+
+ for (i = 0; i < n_keys * 4; i++)
user_key[i] = le32_to_cpu(key[i]);
- snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
- user_key[0], user_key[1], user_key[2], user_key[3]);
+ for (i = 0; i < n_keys; i++) {
+ off += snprintf(tbl.data + off, tbl.maxlen - off,
+ "%08x-%08x-%08x-%08x",
+ user_key[i * 4],
+ user_key[i * 4 + 1],
+ user_key[i * 4 + 2],
+ user_key[i * 4 + 3]);
+ if (i + 1 < n_keys)
+ off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
+ }
+
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
- user_key + 2, user_key + 3) != 4) {
+ backup_data = strchr(tbl.data, ',');
+ if (backup_data) {
+ *backup_data = '\0';
+ backup_data++;
+ }
+ if (sscanf_key(tbl.data, key)) {
ret = -EINVAL;
goto bad_key;
}
-
- for (i = 0; i < ARRAY_SIZE(user_key); i++)
- key[i] = cpu_to_le32(user_key[i]);
-
+ if (backup_data) {
+ if (sscanf_key(backup_data, key + 4)) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ }
tcp_fastopen_reset_cipher(net, NULL, key,
- TCP_FASTOPEN_KEY_LENGTH);
+ backup_data ? key + 4 : NULL);
}
bad_key:
- pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
- user_key[0], user_key[1], user_key[2], user_key[3],
- (char *)tbl.data, ret);
kfree(tbl.data);
return ret;
}
@@ -956,7 +997,12 @@ static struct ctl_table ipv4_net_table[] = {
.procname = "tcp_fastopen_key",
.mode = 0600,
.data = &init_net.ipv4.sysctl_tcp_fastopen,
- .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)),
.proc_handler = proc_tcp_fastopen_key,
},
{
@@ -984,7 +1030,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = &zero,
- .extra2 = &one,
+ .extra2 = &two,
},
#endif
{
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7dc9ab84bb69..47c217905864 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2741,6 +2741,21 @@ static int tcp_repair_options_est(struct sock *sk,
return 0;
}
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+}
+
/*
* Socket option code for TCP.
*/
@@ -2791,15 +2806,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
return err;
}
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
- if (optlen != sizeof(key))
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
return -EINVAL;
if (copy_from_user(key, optval, optlen))
return -EFAULT;
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
}
default:
/* fallthru */
@@ -3083,6 +3106,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+ case TCP_TX_DELAY:
+ if (val)
+ tcp_enable_tx_delay();
+ tp->tcp_tx_delay = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3453,21 +3481,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
struct tcp_fastopen_context *ctx;
+ unsigned int key_len = 0;
if (get_user(len, optlen))
return -EFAULT;
rcu_read_lock();
ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
- if (ctx)
- memcpy(key, ctx->key, sizeof(key));
- else
- len = 0;
+ if (ctx) {
+ key_len = tcp_fastopen_context_len(ctx) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ memcpy(&key[0], &ctx->key[0], key_len);
+ }
rcu_read_unlock();
- len = min_t(unsigned int, len, sizeof(key));
+ len = min_t(unsigned int, len, key_len);
if (put_user(len, optlen))
return -EFAULT;
if (copy_to_user(optval, key, len))
@@ -3540,6 +3570,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_no_cookie;
break;
+ case TCP_TX_DELAY:
+ val = tp->tcp_tx_delay;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f5a45e1e1182..3fd451271a70 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -30,15 +30,15 @@ void tcp_fastopen_init_key_once(struct net *net)
* for a valid cookie, so this is an acceptable risk.
*/
get_random_bytes(key, sizeof(key));
- tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, NULL, key, NULL);
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
container_of(head, struct tcp_fastopen_context, rcu);
- crypto_free_cipher(ctx->tfm);
- kfree(ctx);
+
+ kzfree(ctx);
}
void tcp_fastopen_destroy_cipher(struct sock *sk)
@@ -67,31 +67,27 @@ void tcp_fastopen_ctx_destroy(struct net *net)
}
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
- void *key, unsigned int len)
+ void *primary_key, void *backup_key)
{
struct tcp_fastopen_context *ctx, *octx;
struct fastopen_queue *q;
- int err;
+ int err = 0;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
-
- if (IS_ERR(ctx->tfm)) {
- err = PTR_ERR(ctx->tfm);
-error: kfree(ctx);
- pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
- return err;
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
}
- err = crypto_cipher_setkey(ctx->tfm, key, len);
- if (err) {
- pr_err("TCP: TFO cipher key error: %d\n", err);
- crypto_free_cipher(ctx->tfm);
- goto error;
- }
- memcpy(ctx->key, key, len);
+ ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+ ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
+ if (backup_key) {
+ ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+ ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
+ ctx->num = 2;
+ } else {
+ ctx->num = 1;
+ }
spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
if (sk) {
@@ -108,66 +104,58 @@ error: kfree(ctx);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+out:
return err;
}
-static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path,
- struct tcp_fastopen_cookie *foc)
+static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
+ struct sk_buff *syn,
+ const siphash_key_t *key,
+ struct tcp_fastopen_cookie *foc)
{
- struct tcp_fastopen_context *ctx;
- bool ok = false;
-
- rcu_read_lock();
-
- ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
- if (!ctx)
- ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+ BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
- if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
- foc->len = TCP_FASTOPEN_COOKIE_SIZE;
- ok = true;
- }
- rcu_read_unlock();
- return ok;
-}
-
-/* Generate the fastopen cookie by doing aes128 encryption on both
- * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
- * addresses. For the longer IPv6 addresses use CBC-MAC.
- *
- * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
- */
-static bool tcp_fastopen_cookie_gen(struct sock *sk,
- struct request_sock *req,
- struct sk_buff *syn,
- struct tcp_fastopen_cookie *foc)
-{
if (req->rsk_ops->family == AF_INET) {
const struct iphdr *iph = ip_hdr(syn);
- __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
- return __tcp_fastopen_cookie_gen(sk, path, foc);
+ foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+ sizeof(iph->saddr) +
+ sizeof(iph->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
-
#if IS_ENABLED(CONFIG_IPV6)
if (req->rsk_ops->family == AF_INET6) {
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
- struct tcp_fastopen_cookie tmp;
-
- if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) {
- struct in6_addr *buf = &tmp.addr;
- int i;
- for (i = 0; i < 4; i++)
- buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
- return __tcp_fastopen_cookie_gen(sk, buf, foc);
- }
+ foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+ sizeof(ip6h->saddr) +
+ sizeof(ip6h->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
#endif
return false;
}
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
+ */
+static void tcp_fastopen_cookie_gen(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (ctx)
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
+ rcu_read_unlock();
+}
/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
* queue this additional data / FIN.
@@ -212,6 +200,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
tcp_fin(sk);
}
+/* returns 0 - no key match, 1 for primary, 2 for backup */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *orig,
+ struct tcp_fastopen_cookie *valid_foc)
+{
+ struct tcp_fastopen_cookie search_foc = { .len = -1 };
+ struct tcp_fastopen_cookie *foc = valid_foc;
+ struct tcp_fastopen_context *ctx;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (!ctx)
+ goto out;
+ for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
+ if (tcp_fastopen_cookie_match(foc, orig)) {
+ ret = i + 1;
+ goto out;
+ }
+ foc = &search_foc;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct request_sock *req)
@@ -327,6 +344,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
+ int ret = 0;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -342,31 +360,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
- if (foc->len >= 0 && /* Client presents or requests a cookie */
- tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
- foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
- foc->len == valid_foc.len &&
- !memcmp(foc->val, valid_foc.val, foc->len)) {
- /* Cookie is valid. Create a (full) child socket to accept
- * the data in SYN before returning a SYN-ACK to ack the
- * data. If we fail to create the socket, fall back and
- * ack the ISN only but includes the same cookie.
- *
- * Note: Data-less SYN with valid cookie is allowed to send
- * data in SYN_RECV state.
- */
+ if (foc->len == 0) {
+ /* Client requests a cookie. */
+ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+ } else if (foc->len > 0) {
+ ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+ &valid_foc);
+ if (!ret) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else {
+ /* Cookie is valid. Create a (full) child socket to
+ * accept the data in SYN before returning a SYN-ACK to
+ * ack the data. If we fail to create the socket, fall
+ * back and ack the ISN only but includes the same
+ * cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to
+ * send data in SYN_RECV state.
+ */
fastopen:
- child = tcp_fastopen_create_child(sk, skb, req);
- if (child) {
- foc->len = -1;
+ child = tcp_fastopen_create_child(sk, skb, req);
+ if (child) {
+ if (ret == 2) {
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+ } else {
+ foc->len = -1;
+ }
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return child;
+ }
NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
- return child;
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
}
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (foc->len > 0) /* Client presents an invalid cookie */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
-
+ }
valid_foc.exp = foc->exp;
*foc = valid_foc;
return NULL;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d95ee40df6c2..b71efeb0ae5b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk,
void (*cad)(struct sock *sk, u32 ack_seq))
{
icsk->icsk_clean_acked = cad;
- static_branch_inc(&clean_acked_data_enabled.key);
+ static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cfa81190a1b1..d57641cb3477 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
int genhash;
struct sock *sk1 = NULL;
#endif
- struct net *net;
+ u64 transmit_time = 0;
struct sock *ctl_sk;
+ struct net *net;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -766,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ transmit_time = tcp_transmit_time(sk);
+ }
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
@@ -808,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
struct sock *ctl_sk;
+ u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -858,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
- ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c35731816e2..8bcaf2586b68 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
-
+ tcptw->tw_tx_delay = tp->tcp_tx_delay;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+ tw->tw_txhash = sk->sk_txhash;
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ebc33d1c9e5..4af1f5dae9d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
+ tcp_add_tx_delay(skb, tp);
+
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) {
@@ -2239,6 +2241,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+ tcp_sk(sk)->tcp_tx_delay) {
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+ * approximate our needs assuming an ~100% skb->truesize overhead.
+ * USEC_PER_SEC is approximated by 2^20.
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+ */
+ extra_bytes >>= (20 - 1);
+ limit += extra_bytes;
+ }
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
@@ -3217,6 +3231,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
int tcp_header_size;
struct tcphdr *th;
int mss;
+ u64 now;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
@@ -3248,13 +3263,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
+ now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
skb->skb_mstamp_ns = cookie_init_timestamp(req);
else
#endif
{
- skb->skb_mstamp_ns = tcp_clock_ns();
+ skb->skb_mstamp_ns = now;
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
@@ -3297,8 +3313,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
- /* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp = 0;
+ skb->skb_mstamp_ns = now;
+ tcp_add_tx_delay(skb, tp);
+
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eed59c847722..1b971bd95786 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
-/* IPCB reference means this can not be used from early demux */
-static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
- return true;
-#endif
- return false;
-}
-
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
@@ -364,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
@@ -420,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
+ int dif, int sdif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
@@ -432,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ daddr, hnum, dif, sdif);
if (score > badness) {
if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
@@ -460,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
- bool exact_dif = udp_lib_exact_dif_match(net, skb);
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -468,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
+ hslot2, skb);
if (!result) {
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
@@ -476,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
- exact_dif, hslot2, skb);
+ hslot2, skb);
}
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 9763464a75d7..a3908e55ed89 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -208,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
gso_skb->destructor = NULL;
segs = skb_segment(gso_skb, features);
- if (unlikely(IS_ERR_OR_NULL(segs))) {
+ if (IS_ERR_OR_NULL(segs)) {
if (copy_dtor)
gso_skb->destructor = sock_wfree;
return segs;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 081bb517e40d..521e3203e83a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
+ /* prefix routes only use builtin fib6_nh */
+ if (rt->nh)
continue;
- if (no_gw && rt->fib6_nh.fib_nh_gw_family)
+
+ if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
+ continue;
+ if (no_gw && rt->fib6_nh->fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
@@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
if (ifa->ifa_scope == RT_SCOPE_LINK)
@@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct fib6_info *rt = ifa->rt;
+ /* host routes only use builtin fib6_nh */
+ struct fib6_nh *nh = ifa->rt->fib6_nh;
int cpu;
rcu_read_lock();
ifa->rt->dst_nopolicy = val ? true : false;
- if (rt->rt6i_pcpu) {
+ if (nh->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
- rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
addrconf_set_nopolicy(*rtp, val);
}
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5b1246635e02..783f3c1466da 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
+{
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.fib6_select_path = eafnosupport_fib6_select_path,
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
+ .ip6_del_rt = eafnosupport_ip6_del_rt,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 5352708b7b2d..7382a927d1eb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -208,7 +208,7 @@ lookup_protocol:
np->mc_loop = 1;
np->mc_all = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->repflow = net->ipv6.sysctl.flowlabel_reflect;
+ np->repflow = net->ipv6.sysctl.flowlabel_reflect & 1;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
@@ -922,6 +922,9 @@ static const struct ipv6_stub ipv6_stub_impl = {
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.fib6_nh_init = fib6_nh_init,
.fib6_nh_release = fib6_nh_release,
+ .fib6_update_sernum = fib6_update_sernum_stub,
+ .fib6_rt_update = fib6_rt_update,
+ .ip6_del_rt = ip6_del_rt,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index bcfae13409b5..d22b6c140f23 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
}
- dst_hold(&net->ipv6.ip6_null_entry->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
return &net->ipv6.ip6_null_entry->dst;
}
@@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
again:
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
- dst_hold(&rt->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
out:
res->rt6 = rt;
return err;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 375b4b4f9bf5..12906301ec7b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -75,9 +75,9 @@
*
* On SMP we have one ICMP socket per-cpu.
*/
-static inline struct sock *icmpv6_sk(struct net *net)
+static struct sock *icmpv6_sk(struct net *net)
{
- return *this_cpu_ptr(net->ipv6.icmp_sk);
+ return this_cpu_read(*net->ipv6.icmp_sk);
}
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b2a55f300318..cf60fae9533b 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, &in6addr_any, hnum,
dif, sdif);
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 9180c8b6f764..49884f96232b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
struct fib6_info *f6i;
+ size_t sz = sizeof(*f6i);
- f6i = kzalloc(sizeof(*f6i), gfp_flags);
+ if (with_fib6_nh)
+ sz += sizeof(struct fib6_nh);
+
+ f6i = kzalloc(sz, gfp_flags);
if (!f6i)
return NULL;
- f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
- if (!f6i->rt6i_pcpu) {
- kfree(f6i);
- return NULL;
- }
-
+ /* fib6_siblings is a union with nh_list, so this initializes both */
INIT_LIST_HEAD(&f6i->fib6_siblings);
refcount_set(&f6i->fib6_ref, 1);
@@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
- struct rt6_exception_bucket *bucket;
WARN_ON(f6i->fib6_node);
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
- kfree(bucket);
-
- if (f6i->rt6i_pcpu) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
-
- free_percpu(f6i->rt6i_pcpu);
- }
-
- fib6_nh_release(&f6i->fib6_nh);
+ if (f6i->nh)
+ nexthop_put(f6i->nh);
+ else
+ fib6_nh_release(f6i->fib6_nh);
ip_fib_metrics_put(f6i->fib6_metrics);
-
kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
@@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
+ if (!(flags | RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
}
return &rt->dst;
@@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
return call_fib6_notifier(nb, net, event_type, &info.info);
}
-static int call_fib6_entry_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib6_info *rt,
- struct netlink_ext_ack *extack)
+int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ rt->fib6_table->fib_seq++;
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_multipath_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
.info.extack = extack,
.rt = rt,
+ .nsiblings = nsiblings,
};
rt->fib6_table->fib_seq++;
@@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w)
struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
- res = rt6_dump_route(rt, w->args);
- if (res < 0) {
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
+ if (res >= 0) {
/* Frame is full, suspend walking */
w->leaf = rt;
+
+ /* We'll restart from this node, so if some routes were
+ * already dumped, skip them next time.
+ */
+ w->skip_in_node += res;
+
return 1;
}
+ w->skip_in_node = 0;
/* Multipath routes are dumped in one route with the
* RTA_MULTIPATH attribute. Jump 'rt' to point to the
@@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
if (cb->args[4] == 0) {
w->count = 0;
w->skip = 0;
+ w->skip_in_node = 0;
spin_lock_bh(&table->tb6_lock);
res = fib6_walk(net, w);
@@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->state = FWS_INIT;
w->node = w->root;
w->skip = w->count;
+ w->skip_in_node = 0;
} else
w->skip = 0;
@@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
+ .filter.dump_routes = true };
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct rt6_rtnl_dump_arg arg = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib6_walker *w;
@@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
struct rtmsg *rtm = nlmsg_data(nlh);
- arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
+ if (rtm->rtm_flags & RTM_F_PREFIX)
+ arg.filter.flags = RTM_F_PREFIX;
}
- /* fib entries are never clones */
- if (arg.filter.flags & RTM_F_CLONED)
- goto out;
-
w = (void *)cb->args[2];
if (!w) {
/* New dump:
@@ -895,16 +897,14 @@ insert_above:
return ln;
}
-static void fib6_drop_pcpu_from(struct fib6_info *f6i,
- const struct fib6_table *table)
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
+ const struct fib6_info *match,
+ const struct fib6_table *table)
{
int cpu;
- /* Make sure rt6_make_pcpu_route() wont add other percpu routes
- * while we are cleaning them here.
- */
- f6i->fib6_destroying = 1;
- mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+ if (!fib6_nh->rt6i_pcpu)
+ return;
/* release the reference to this fib entry from
* all of its cached pcpu routes
@@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
+
+ /* only dropping the 'from' reference if the cached route
+ * is using 'match'. The cached pcpu_rt->from only changes
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
+ * change from one fib6_info reference to another
+ */
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
struct fib6_info *from;
from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
@@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
}
}
+struct fib6_nh_pcpu_arg {
+ struct fib6_info *from;
+ const struct fib6_table *table;
+};
+
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_pcpu_arg *arg = _arg;
+
+ __fib6_drop_pcpu_from(nh, arg->from, arg->table);
+ return 0;
+}
+
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
+{
+ /* Make sure rt6_make_pcpu_route() wont add other percpu routes
+ * while we are cleaning them here.
+ */
+ f6i->fib6_destroying = 1;
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+
+ if (f6i->nh) {
+ struct fib6_nh_pcpu_arg arg = {
+ .from = f6i,
+ .table = table
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
+ &arg);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = f6i->fib6_nh;
+ __fib6_drop_pcpu_from(fib6_nh, f6i, table);
+ }
+}
+
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
struct fib6_table *table = rt->fib6_table;
- if (rt->rt6i_pcpu)
- fib6_drop_pcpu_from(rt, table);
+ fib6_drop_pcpu_from(rt, table);
+
+ if (rt->nh && !list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@@ -1101,11 +1147,13 @@ next_iter:
add:
nlflags |= NLM_F_CREATE;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_ADD,
+ rt, extack);
+ if (err)
+ return err;
+ }
rcu_assign_pointer(rt->fib6_next, iter);
fib6_info_hold(rt);
@@ -1130,11 +1178,13 @@ add:
return -ENOENT;
}
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_REPLACE,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_REPLACE,
+ rt, extack);
+ if (err)
+ return err;
+ }
fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
@@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum_upto_root(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
@@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
err = fib6_add_rt2node(fn, rt, info, extack);
if (!err) {
+ if (rt->nh)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
}
@@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
if (plen == fn->fn_bit)
return fn;
- prev = fn;
+ if (fn->fn_flags & RTN_RTINFO)
+ prev = fn;
next:
/*
@@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fib6_purge_rt(rt, fn, net);
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+ if (!info->skip_notify_kernel)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+
fib6_info_release(rt);
}
@@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.w.func = fib6_clean_node;
c.w.count = 0;
c.w.skip = 0;
+ c.w.skip_in_node = 0;
c.func = func;
c.sernum = sernum;
c.arg = arg;
@@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
unsigned int flags = rt->fib6_flags;
const struct net_device *dev;
+ if (rt->nh)
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
@@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->fib6_nh.fib_nh_gw_family) {
+ if (fib6_nh->fib_nh_gw_family) {
flags |= RTF_GATEWAY;
- seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
} else {
seq_puts(seq, "00000000000000000000000000000000");
}
- dev = rt->fib6_nh.fib_nh_dev;
+ dev = fib6_nh->fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
flags, dev ? dev->name : "");
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 21efcd02f337..8e49fd62eea9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
return -EINVAL;
}
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
-
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
return ip6_finish_output2(net, sk, skb);
}
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip6_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
@@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id,
+ struct ip6_fraglist_iter *iter)
+{
+ unsigned int first_len;
+ struct frag_hdr *fh;
+
+ /* BUILD HEADER */
+ *prevhdr = NEXTHDR_FRAGMENT;
+ iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!iter->tmp_hdr)
+ return -ENOMEM;
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->hlen = hlen;
+ iter->frag_id = frag_id;
+ iter->nexthdr = nexthdr;
+
+ __skb_pull(skb, hlen);
+ fh = __skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ fh->identification = frag_id;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
+
+void ip6_fraglist_prepare(struct sk_buff *skb,
+ struct ip6_fraglist_iter *iter)
+{
+ struct sk_buff *frag = iter->frag;
+ unsigned int hlen = iter->hlen;
+ struct frag_hdr *fh;
+
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = __skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+ iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = iter->nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(iter->offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = iter->frag_id;
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+ unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+ state->prevhdr = prevhdr;
+ state->nexthdr = nexthdr;
+ state->frag_id = frag_id;
+
+ state->hlen = hlen;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->hroom = hdr_room;
+ state->troom = needed_tailroom;
+
+ state->offset = 0;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
+{
+ u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
+ struct sk_buff *frag;
+ struct frag_hdr *fh;
+ unsigned int len;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left)
+ len &= ~7;
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+ state->hroom + state->troom, GFP_ATOMIC);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, state->hroom);
+ skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+ frag->transport_header = (frag->network_header + state->hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = state->nexthdr;
+ fh->reserved = 0;
+ fh->identification = state->frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+ len));
+ state->left -= len;
+
+ fh->frag_off = htons(state->offset);
+ if (state->left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+ state->ptr += len;
+ state->offset += len;
+
+ return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
@@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL;
- struct ipv6hdr *tmp_hdr;
- struct frag_hdr *fh;
- unsigned int mtu, hlen, left, len, nexthdr_offset;
- int hroom, troom;
+ struct ip6_frag_state state;
+ unsigned int mtu, hlen, nexthdr_offset;
+ int hroom, err = 0;
__be32 frag_id;
- int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb->truesize -= frag->truesize;
}
- err = 0;
- offset = 0;
- /* BUILD HEADER */
-
- *prevhdr = NEXTHDR_FRAGMENT;
- tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
- if (!tmp_hdr) {
- err = -ENOMEM;
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
goto fail;
- }
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
-
- __skb_pull(skb, hlen);
- fh = __skb_push(skb, sizeof(struct frag_hdr));
- __skb_push(skb, hlen);
- skb_reset_network_header(skb);
- memcpy(skb_network_header(skb), tmp_hdr, hlen);
-
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(IP6_MF);
- fh->identification = frag_id;
-
- first_len = skb_pagelen(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- ipv6_hdr(skb)->payload_len = htons(first_len -
- sizeof(struct ipv6hdr));
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- fh = __skb_push(frag, sizeof(struct frag_hdr));
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), tmp_hdr,
- hlen);
- offset += skb->len - hlen - sizeof(struct frag_hdr);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(offset);
- if (frag->next)
- fh->frag_off |= htons(IP6_MF);
- fh->identification = frag_id;
- ipv6_hdr(frag)->payload_len =
- htons(frag->len -
- sizeof(struct ipv6hdr));
- ip6_copy_metadata(frag, skb);
- }
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb_mark_not_on_list(skb);
+ skb = ip6_fraglist_next(&iter);
}
- kfree(tmp_hdr);
+ kfree(iter.tmp_hdr);
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
@@ -766,91 +891,26 @@ slow_path_clean:
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- troom = rt->dst.dev->needed_tailroom;
+ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+ LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+ &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- u8 *fragnexthdr_offset;
-
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /* Allocate buffer */
- frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
- hroom + troom, GFP_ATOMIC);
- if (!frag) {
- err = -ENOMEM;
+ while (state.left > 0) {
+ frag = ip6_frag_next(skb, &state);
+ if (IS_ERR(frag)) {
+ err = PTR_ERR(frag);
goto fail;
}
/*
- * Set up data on packet
- */
-
- ip6_copy_metadata(frag, skb);
- skb_reserve(frag, hroom);
- skb_put(frag, len + hlen + sizeof(struct frag_hdr));
- skb_reset_network_header(frag);
- fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
- frag->transport_header = (frag->network_header + hlen +
- sizeof(struct frag_hdr));
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
- if (skb->sk)
- skb_set_owner_w(frag, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
- skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
-
- fragnexthdr_offset = skb_network_header(frag);
- fragnexthdr_offset += prevhdr - skb_network_header(skb);
- *fragnexthdr_offset = NEXTHDR_FRAGMENT;
-
- /*
- * Build fragment header.
- */
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->identification = frag_id;
-
- /*
- * Copy a block of the IP datagram.
- */
- BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
- len));
- left -= len;
-
- fh->frag_off = htons(offset);
- if (left > 0)
- fh->frag_off |= htons(IP6_MF);
- ipv6_hdr(frag)->payload_len = htons(frag->len -
- sizeof(struct ipv6hdr));
-
- ptr += len;
- offset += len;
-
- /*
* Put this fragment into the sending queue.
*/
err = output(net, sk, frag);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 09dd2edfb868..083cc1c94cd3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb)
!in6_dev->cnf.accept_ra_rtr_pref)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
-
+ /* routes added from RAs do not use nexthop objects */
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
if (rt) {
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
@@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 1240ccd57f39..61819ed858b1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,6 +16,9 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
{
@@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
}
EXPORT_SYMBOL_GPL(__nf_ip6_route);
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ struct ip6_frag_state state;
+ u8 *prevhdr, nexthdr = 0;
+ unsigned int mtu, hlen;
+ int hroom, err = 0;
+ __be32 frag_id;
+
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto blackhole;
+ hlen = err;
+ nexthdr = *prevhdr;
+
+ mtu = skb->dev->mtu;
+ if (frag_max_size > mtu ||
+ frag_max_size < IPV6_MIN_MTU)
+ goto blackhole;
+
+ mtu = frag_max_size;
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto blackhole;
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ hroom = LL_RESERVED_SPACE(skb->dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ if (skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag2) {
+ if (frag2->len > mtu ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag2))
+ goto slow_path;
+ }
+
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto blackhole;
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down.
+ */
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip6_fraglist_next(&iter);
+ }
+
+ kfree(iter.tmp_hdr);
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+ LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+ &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip6_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
static const struct nf_ipv6_ops ipv6ops = {
#if IS_MODULE(CONFIG_IPV6)
.chk_addr = ipv6_chk_addr,
.route_me_harder = ip6_route_me_harder,
.dev_get_saddr = ipv6_dev_get_saddr,
.route = __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ .cookie_init_sequence = __cookie_v6_init_sequence,
+ .cookie_v6_check = __cookie_v6_check,
+#endif
#endif
.route_input = ip6_route_input,
.fragment = ip6_fragment,
.reroute = nf_ip6_reroute,
+#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ .br_defrag = nf_ct_frag6_gather,
+#endif
+#if IS_MODULE(CONFIG_IPV6)
+ .br_fragment = br_ip6_fragment,
+#endif
};
int __init ipv6_netfilter_init(void)
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 41325d517478..e77ea1ed5edd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -3,272 +3,11 @@
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct ipv6hdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- struct ipv6hdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
- iph->nexthdr = IPPROTO_TCP;
- iph->saddr = *saddr;
- iph->daddr = *daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct ipv6hdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_TCP;
- fl6.saddr = niph->saddr;
- fl6.daddr = niph->daddr;
- fl6.fl6_sport = nth->source;
- fl6.fl6_dport = nth->dest;
- security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- goto free_nskb;
- }
- dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
- if (IS_ERR(dst))
- goto free_nskb;
-
- skb_dst_set(nskb, dst);
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip6_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
@@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(net, skb, th, &opts);
+ synproxy_send_client_synack_ipv6(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq))) {
consume_skb(skb);
return NF_STOLEN;
} else {
@@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv6_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- __be16 frag_off;
- u8 nexthdr;
- int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb))
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (thoff < 0 || nexthdr != IPPROTO_TCP)
- return NF_ACCEPT;
-
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv6_synproxy_ops[] = {
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg6_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref6 == 0) {
- err = nf_register_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv6_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref6++;
return err;
}
@@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref6--;
- if (snet->hook_ref6 == 0)
- nf_unregister_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
+ nf_synproxy_ipv6_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3f7d4691c423..a22100b1cf2c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -2,7 +2,7 @@
/*
* IPv6 raw table, a port of the IPv4 raw table to IPv6
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 84322ce81d70..398e1df41406 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -54,26 +54,21 @@ static struct inet_frags nf_frags;
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
- .data = &init_net.nf_frag.frags.timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_frag6_low_thresh",
- .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.nf_frag.frags.high_thresh
},
{
.procname = "nf_conntrack_frag6_high_thresh",
- .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.nf_frag.frags.low_thresh
},
{ }
};
@@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
GFP_KERNEL);
if (table == NULL)
goto err_alloc;
-
- table[0].data = &net->nf_frag.frags.timeout;
- table[1].data = &net->nf_frag.frags.low_thresh;
- table[1].extra2 = &net->nf_frag.frags.high_thresh;
- table[2].data = &net->nf_frag.frags.high_thresh;
- table[2].extra1 = &net->nf_frag.frags.low_thresh;
- table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
}
+ table[0].data = &net->nf_frag.fqdir->timeout;
+ table[1].data = &net->nf_frag.fqdir->low_thresh;
+ table[1].extra2 = &net->nf_frag.fqdir->high_thresh;
+ table[2].data = &net->nf_frag.fqdir->high_thresh;
+ table[2].extra1 = &net->nf_frag.fqdir->low_thresh;
+ table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh;
+
hdr = register_net_sysctl(net, "net/netfilter", table);
if (hdr == NULL)
goto err_reg;
@@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
/* Creation primitives. */
@@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->nf_frag.frags, &key);
+ q = inet_frag_find(net->nf_frag.fqdir, &key);
if (!q)
return NULL;
@@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net)
{
int res;
- net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->nf_frag.frags.f = &nf_frags;
-
- res = inet_frags_init_net(&net->nf_frag.frags);
+ res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
if (res < 0)
return res;
+
+ net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
return res;
}
+static void nf_ct_net_pre_exit(struct net *net)
+{
+ fqdir_pre_exit(net->nf_frag.fqdir);
+}
+
static void nf_ct_net_exit(struct net *net)
{
nf_ct_frags6_sysctl_unregister(net);
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
}
static struct pernet_operations nf_ct_net_ops = {
- .init = nf_ct_net_init,
- .exit = nf_ct_net_exit,
+ .init = nf_ct_net_init,
+ .pre_exit = nf_ct_net_pre_exit,
+ .exit = nf_ct_net_exit,
};
static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 4a8da679866e..bbff3e02e302 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
- atomic_read(&net->ipv6.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv6.frags));
+ atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv6.fqdir));
return 0;
}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b2b2c0c38b87..ca05b16f1bb9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
static struct frag_queue *
@@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
IPV6_ADDR_LINKLOCAL)))
key.iif = 0;
- q = inet_frag_find(&net->ipv6.frags, &key);
+ q = inet_frag_find(net->ipv6.fqdir, &key);
if (!q)
return NULL;
@@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
fq->ecn |= ecn;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
fragsize = -skb_network_offset(skb) + skb->len;
if (fragsize > fq->q.max_size)
@@ -250,7 +248,7 @@ err:
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
{
- struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+ struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
void *reasm_data;
int payload_len;
@@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = {
static struct ctl_table ip6_frags_ns_ctl_table[] = {
{
.procname = "ip6frag_high_thresh",
- .data = &init_net.ipv6.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv6.frags.low_thresh
},
{
.procname = "ip6frag_low_thresh",
- .data = &init_net.ipv6.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv6.frags.high_thresh
},
{
.procname = "ip6frag_time",
- .data = &init_net.ipv6.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv6.frags.high_thresh;
- table[0].extra1 = &net->ipv6.frags.low_thresh;
- table[1].data = &net->ipv6.frags.low_thresh;
- table[1].extra2 = &net->ipv6.frags.high_thresh;
- table[2].data = &net->ipv6.frags.timeout;
}
+ table[0].data = &net->ipv6.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv6.fqdir->low_thresh;
+ table[1].data = &net->ipv6.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv6.fqdir->high_thresh;
+ table[2].data = &net->ipv6.fqdir->timeout;
hdr = register_net_sysctl(net, "net/ipv6", table);
if (!hdr)
@@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
- net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->ipv6.frags.f = &ip6_frags;
-
- res = inet_frags_init_net(&net->ipv6.frags);
+ res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
if (res < 0)
return res;
+ net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = ip6_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
return res;
}
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv6.fqdir);
+}
+
static void __net_exit ipv6_frags_exit_net(struct net *net)
{
ip6_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
}
static struct pernet_operations ip6_frags_ops = {
- .init = ipv6_frags_init_net,
- .exit = ipv6_frags_exit_net,
+ .init = ipv6_frags_init_net,
+ .pre_exit = ipv6_frags_pre_exit_net,
+ .exit = ipv6_frags_exit_net,
};
static const struct rhashtable_params ip6_rhash_params = {
@@ -587,8 +585,8 @@ err_protocol:
void ipv6_frag_exit(void)
{
- inet_frags_fini(&ip6_frags);
ip6_frags_sysctl_unregister();
unregister_pernet_subsys(&ip6_frags_ops);
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ inet_frags_fini(&ip6_frags);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 97a843cf164c..7556275b1cef 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict);
-static size_t rt6_nlmsg_size(struct fib6_info *rt);
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
- if (!match->fib6_nsiblings || have_oif_match)
+ if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
goto out;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
- if (!fl6->mp_hash)
+ if (!fl6->mp_hash &&
+ (!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
+ if (unlikely(match->nh)) {
+ nexthop_path_fib6_result(res, fl6->mp_hash);
+ return;
+ }
+
+ if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
- const struct fib6_nh *nh = &sibling->fib6_nh;
+ const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
@@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
out:
res->f6i = match;
- res->nh = &match->fib6_nh;
+ res->nh = match->fib6_nh;
}
/*
@@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
return false;
}
+struct fib6_nh_dm_arg {
+ struct net *net;
+ const struct in6_addr *saddr;
+ int oif;
+ int flags;
+ struct fib6_nh *nh;
+};
+
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_dm_arg *arg = _arg;
+
+ arg->nh = nh;
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
+ arg->flags);
+}
+
+/* returns fib6_nh from nexthop or NULL */
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
+ struct fib6_result *res,
+ const struct in6_addr *saddr,
+ int oif, int flags)
+{
+ struct fib6_nh_dm_arg arg = {
+ .net = net,
+ .saddr = saddr,
+ .oif = oif,
+ .flags = flags,
+ };
+
+ if (nexthop_is_blackhole(nh))
+ return NULL;
+
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
+ return arg.nh;
+
+ return NULL;
+}
+
static void rt6_device_match(struct net *net, struct fib6_result *res,
const struct in6_addr *saddr, int oif, int flags)
{
@@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
struct fib6_nh *nh;
if (!oif && ipv6_addr_any(saddr)) {
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
if (!(nh->fib_nh_flags & RTNH_F_DEAD))
goto out;
}
for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
- nh = &spf6i->fib6_nh;
- if (__rt6_device_match(net, nh, saddr, oif, flags)) {
+ bool matched = false;
+
+ if (unlikely(spf6i->nh)) {
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
+ oif, flags);
+ if (nh)
+ matched = true;
+ } else {
+ nh = spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
+ matched = true;
+ }
+ if (matched) {
res->f6i = spf6i;
goto out;
}
@@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
if (oif && flags & RT6_LOOKUP_F_IFACE) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
goto out;
}
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
+
if (nh->fib_nh_flags & RTNH_F_DEAD) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
}
out:
res->nh = nh;
res->fib6_type = res->f6i->fib6_type;
res->fib6_flags = res->f6i->fib6_flags;
+ return;
+
+out_blackhole:
+ res->fib6_flags |= RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -691,6 +766,24 @@ out:
return rc;
}
+struct fib6_nh_frl_arg {
+ u32 flags;
+ int oif;
+ int strict;
+ int *mpri;
+ bool *do_rr;
+ struct fib6_nh *nh;
+};
+
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_frl_arg *arg = _arg;
+
+ arg->nh = nh;
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
+ arg->mpri, arg->do_rr);
+}
+
static void __find_rr_leaf(struct fib6_info *f6i_start,
struct fib6_info *nomatch, u32 metric,
struct fib6_result *res, struct fib6_info **cont,
@@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
for (f6i = f6i_start;
f6i && f6i != nomatch;
f6i = rcu_dereference(f6i->fib6_next)) {
+ bool matched = false;
struct fib6_nh *nh;
if (cont && f6i->fib6_metric != metric) {
@@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
if (fib6_check_expired(f6i))
continue;
- nh = &f6i->fib6_nh;
- if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
+ if (unlikely(f6i->nh)) {
+ struct fib6_nh_frl_arg arg = {
+ .flags = f6i->fib6_flags,
+ .oif = oif,
+ .strict = strict,
+ .mpri = mpri,
+ .do_rr = do_rr
+ };
+
+ if (nexthop_is_blackhole(f6i->nh)) {
+ res->fib6_flags = RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->f6i = f6i;
+ res->nh = nexthop_fib6_nh(f6i->nh);
+ return;
+ }
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
+ &arg)) {
+ matched = true;
+ nh = arg.nh;
+ }
+ } else {
+ nh = f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
+ mpri, do_rr))
+ matched = true;
+ }
+ if (matched) {
res->f6i = f6i;
res->nh = nh;
res->fib6_flags = f6i->fib6_flags;
@@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
out:
if (!res->f6i) {
res->f6i = net->ipv6.fib6_null_entry;
- res->nh = &res->f6i->fib6_nh;
+ res->nh = res->f6i->fib6_nh;
res->fib6_flags = res->f6i->fib6_flags;
res->fib6_type = res->f6i->fib6_type;
}
@@ -1114,6 +1234,8 @@ restart:
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
goto out;
+ } else if (res.fib6_flags & RTF_REJECT) {
+ goto do_create;
}
fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
@@ -1125,6 +1247,7 @@ restart:
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
} else {
+do_create:
rt = ip6_create_rt_rcu(&res);
}
@@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
- struct rt6_info *pcpu_rt, **p;
-
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
- pcpu_rt = *p;
+ struct rt6_info *pcpu_rt;
- if (pcpu_rt)
- ip6_hold_safe(NULL, &pcpu_rt);
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
return pcpu_rt;
}
@@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(res);
- if (!pcpu_rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return net->ipv6.ip6_null_entry;
- }
+ if (!pcpu_rt)
+ return NULL;
- dst_hold(&pcpu_rt->dst);
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
@@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
+
+/* used when the flushed bit is not relevant, only access to the bucket
+ * (ie., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+
+ if (lock)
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+ else
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+ /* remove bucket flushed bit if set */
+ if (bucket) {
+ unsigned long p = (unsigned long)bucket;
+
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ }
+
+ return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+ unsigned long p = (unsigned long)bucket;
+
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+ unsigned long p;
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+
+ p = (unsigned long)bucket;
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+}
+
static int rt6_insert_exception(struct rt6_info *nrt,
const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
+ struct fib6_info *f6i = res->f6i;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *f6i = res->f6i;
+ struct fib6_nh *nh = res->nh;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
- if (f6i->exception_bucket_flushed) {
- err = -EINVAL;
- goto out;
- }
-
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
@@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
- rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+ err = -EINVAL;
+ goto out;
}
#ifdef CONFIG_IPV6_SUBTREES
@@ -1539,7 +1707,7 @@ out:
return err;
}
-void rt6_flush_exceptions(struct fib6_info *rt)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt)
int i;
spin_lock_bh(&rt6_exception_lock);
- /* Prevent rt6_insert_exception() to recreate the bucket list */
- rt->exception_bucket_flushed = 1;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
goto out;
+ /* Prevent rt6_insert_exception() to recreate the bucket list */
+ if (!from)
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
- rt6_remove_exception(bucket, rt6_ex);
- WARN_ON_ONCE(bucket->depth);
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+ if (!from ||
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ WARN_ON_ONCE(!from && bucket->depth);
bucket++;
}
-
out:
spin_unlock_bh(&rt6_exception_lock);
}
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_info *f6i = arg;
+
+ fib6_nh_flush_exceptions(nh, f6i);
+
+ return 0;
+}
+
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+ if (f6i->nh)
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
+ f6i);
+ else
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+}
+
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
@@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
src_key = saddr;
find_ex:
#endif
- bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
@@ -1612,25 +1801,20 @@ find_ex:
}
/* Remove the passed in cached rt from the hash table that contains it */
-static int rt6_remove_exception_rt(struct rt6_info *rt)
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *from;
int err;
- from = rcu_dereference(rt->from);
- if (!from ||
- !(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
- if (!rcu_access_pointer(from->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return -ENOENT;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
return err;
}
-/* Find rt6_ex which contains the passed in rt cache and
- * refresh its stamp
- */
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+struct fib6_nh_excptn_arg {
+ struct rt6_info *rt;
+ int plen;
+};
+
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_excptn_arg *arg = _arg;
+ int err;
+
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
+ if (err == 0)
+ return 1;
+
+ return 0;
+}
+
+static int rt6_remove_exception_rt(struct rt6_info *rt)
{
- struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
- struct rt6_exception *rt6_ex;
struct fib6_info *from;
- rcu_read_lock();
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
- goto unlock;
+ return -EINVAL;
- bucket = rcu_dereference(from->rt6i_exception_bucket);
+ if (from->nh) {
+ struct fib6_nh_excptn_arg arg = {
+ .rt = rt,
+ .plen = from->fib6_src.plen
+ };
+ int rc;
+ /* rc = 1 means an entry was found */
+ rc = nexthop_for_each_fib6_nh(from->nh,
+ rt6_nh_remove_exception_rt,
+ &arg);
+ return rc ? 0 : -ENOENT;
+ }
+
+ return fib6_nh_remove_exception(from->fib6_nh,
+ from->fib6_src.plen, rt);
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
- rt6_ex = __rt6_find_exception_rcu(&bucket,
- &rt->rt6i_dst.addr,
- src_key);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
if (rt6_ex)
rt6_ex->stamp = jiffies;
+}
+struct fib6_nh_match_arg {
+ const struct net_device *dev;
+ const struct in6_addr *gw;
+ struct fib6_nh *match;
+};
+
+/* determine if fib6_nh has given device and gateway */
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_match_arg *arg = _arg;
+
+ if (arg->dev != nh->fib_nh_dev ||
+ (arg->gw && !nh->fib_nh_gw_family) ||
+ (!arg->gw && nh->fib_nh_gw_family) ||
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
+ return 0;
+
+ arg->match = nh;
+
+ /* found a match, break the loop */
+ return 1;
+}
+
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+ struct fib6_nh *fib6_nh;
+
+ rcu_read_lock();
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ goto unlock;
+
+ if (from->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = rt->dst.dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
+
+ if (!arg.match)
+ return;
+ fib6_nh = arg.match;
+ } else {
+ fib6_nh = from->fib6_nh;
+ }
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
rcu_read_unlock();
}
@@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
- struct fib6_info *rt, int mtu)
+ const struct fib6_nh *nh, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
return;
@@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
- struct in6_addr *gateway)
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+ const struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
-void rt6_age_exceptions(struct fib6_info *rt,
- struct fib6_gc_args *gc_args,
- unsigned long now)
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
+struct fib6_nh_age_excptn_arg {
+ struct fib6_gc_args *gc_args;
+ unsigned long now;
+};
+
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_age_excptn_arg *arg = _arg;
+
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
+ return 0;
+}
+
+void rt6_age_exceptions(struct fib6_info *f6i,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ if (f6i->nh) {
+ struct fib6_nh_age_excptn_arg arg = {
+ .gc_args = gc_args,
+ .now = now
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
+ &arg);
+ } else {
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
+ }
+}
+
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, struct fib6_result *res, int strict)
@@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
const struct sk_buff *skb, int flags)
{
struct fib6_result res = {};
- struct rt6_info *rt;
+ struct rt6_info *rt = NULL;
int strict = 0;
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+ !rcu_read_lock_held());
+
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
@@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
fib6_table_lookup(net, table, oif, fl6, &res, strict);
- if (res.f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- rcu_read_unlock();
- dst_hold(&rt->dst);
- return rt;
- }
+ if (res.f6i == net->ipv6.fib6_null_entry)
+ goto out;
fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
- if (ip6_hold_safe(net, &rt))
- dst_use_noref(&rt->dst, jiffies);
-
- rcu_read_unlock();
- return rt;
+ goto out;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
@@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
- struct rt6_info *uncached_rt;
-
- uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
-
- rcu_read_unlock();
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
- if (uncached_rt) {
- /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
- * No need for another dst_hold()
+ if (rt) {
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
+ * As rt6_uncached_list_add() does not consume refcnt,
+ * this refcnt is always returned to the caller even
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
- rt6_uncached_list_add(uncached_rt);
+ rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
- } else {
- uncached_rt = net->ipv6.ip6_null_entry;
- dst_hold(&uncached_rt->dst);
- }
+ rcu_read_unlock();
- return uncached_rt;
+ return rt;
+ }
} else {
/* Get a percpu copy */
-
- struct rt6_info *pcpu_rt;
-
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(&res);
+ rt = rt6_get_pcpu_route(&res);
- if (!pcpu_rt)
- pcpu_rt = rt6_make_pcpu_route(net, &res);
+ if (!rt)
+ rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
- rcu_read_unlock();
-
- return pcpu_rt;
}
+out:
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ ip6_hold_safe(net, &rt);
+ rcu_read_unlock();
+
+ return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -2090,11 +2376,12 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
return mhash >> 1;
}
+/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
- int flags = RT6_LOOKUP_F_HAS_SADDR;
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
@@ -2116,8 +2403,8 @@ void ip6_route_input(struct sk_buff *skb)
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
- skb_dst_set(skb,
- ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
+ &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
@@ -2129,8 +2416,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
bool any_src;
@@ -2138,6 +2426,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
+ /* This function does not take refcnt on the dst */
dst = l3mdev_link_scope_lookup(net, fl6);
if (dst)
return dst;
@@ -2145,6 +2434,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ flags |= RT6_LOOKUP_F_DST_NOREF;
any_src = ipv6_addr_any(&fl6->saddr);
if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
(fl6->flowi6_oif && any_src))
@@ -2157,6 +2447,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
+EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
+
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
+
+ return dst;
+}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@ -2381,10 +2693,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
rcu_read_unlock();
return;
}
- res.nh = &res.f6i->fib6_nh;
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt6->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev + gw. Should be impossible.
+ */
+ if (!arg.match) {
+ rcu_read_unlock();
+ return;
+ }
+
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
@@ -2491,6 +2824,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res,
return true;
}
+struct fib6_nh_rd_arg {
+ struct fib6_result *res;
+ struct flowi6 *fl6;
+ const struct in6_addr *gw;
+ struct rt6_info **ret;
+};
+
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_rd_arg *arg = _arg;
+
+ arg->res->nh = nh;
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
+}
+
/* Handle redirects */
struct ip6rd_flowi {
struct flowi6 fl6;
@@ -2506,6 +2854,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *ret = NULL;
struct fib6_result res = {};
+ struct fib6_nh_rd_arg arg = {
+ .res = &res,
+ .fl6 = fl6,
+ .gw = &rdfl->gateway,
+ .ret = &ret
+ };
struct fib6_info *rt;
struct fib6_node *fn;
@@ -2530,14 +2884,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
restart:
for_each_fib6_node_rt_rcu(fn) {
res.f6i = rt;
- res.nh = &rt->fib6_nh;
-
if (fib6_check_expired(rt))
continue;
if (rt->fib6_flags & RTF_REJECT)
break;
- if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
- goto out;
+ if (unlikely(rt->nh)) {
+ if (nexthop_is_blackhole(rt->nh))
+ continue;
+ /* on match, res->nh is filled in and potentially ret */
+ if (nexthop_for_each_fib6_nh(rt->nh,
+ fib6_nh_redirect_match,
+ &arg))
+ goto out;
+ } else {
+ res.nh = rt->fib6_nh;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
+ &ret))
+ goto out;
+ }
}
if (!rt)
@@ -2554,7 +2918,7 @@ restart:
}
res.f6i = rt;
- res.nh = &rt->fib6_nh;
+ res.nh = rt->fib6_nh;
out:
if (ret) {
ip6_hold_safe(net, &ret);
@@ -2781,10 +3145,9 @@ out:
return entries > rt_max_size;
}
-static struct rt6_info *ip6_nh_lookup_table(struct net *net,
- struct fib6_config *cfg,
- const struct in6_addr *gw_addr,
- u32 tbid, int flags)
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+ const struct in6_addr *gw_addr, u32 tbid,
+ int flags, struct fib6_result *res)
{
struct flowi6 fl6 = {
.flowi6_oif = cfg->fc_ifindex,
@@ -2792,25 +3155,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
.saddr = cfg->fc_prefsrc,
};
struct fib6_table *table;
- struct rt6_info *rt;
+ int err;
table = fib6_get_table(net, tbid);
if (!table)
- return NULL;
+ return -EINVAL;
if (!ipv6_addr_any(&cfg->fc_prefsrc))
flags |= RT6_LOOKUP_F_HAS_SADDR;
flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
- rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
- /* if table lookup failed, fall back to full lookup */
- if (rt == net->ipv6.ip6_null_entry) {
- ip6_rt_put(rt);
- rt = NULL;
- }
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
- return rt;
+ return err;
}
static int ip6_route_check_nh_onlink(struct net *net,
@@ -2818,29 +3179,19 @@ static int ip6_route_check_nh_onlink(struct net *net,
const struct net_device *dev,
struct netlink_ext_ack *extack)
{
- u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
const struct in6_addr *gw_addr = &cfg->fc_gateway;
- u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
- struct fib6_info *from;
- struct rt6_info *grt;
+ struct fib6_result res = {};
int err;
- err = 0;
- grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
- if (grt) {
- rcu_read_lock();
- from = rcu_dereference(grt->from);
- if (!grt->dst.error &&
- /* ignore match if it is the default route */
- from && !ipv6_addr_any(&from->fib6_dst.addr) &&
- (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway or device mismatch");
- err = -EINVAL;
- }
- rcu_read_unlock();
-
- ip6_rt_put(grt);
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
+ /* ignore match if it is the default route */
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop has invalid gateway or device mismatch");
+ err = -EINVAL;
}
return err;
@@ -2853,47 +3204,50 @@ static int ip6_route_check_nh(struct net *net,
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
struct net_device *dev = _dev ? *_dev : NULL;
- struct rt6_info *grt = NULL;
+ int flags = RT6_LOOKUP_F_IFACE;
+ struct fib6_result res = {};
int err = -EHOSTUNREACH;
if (cfg->fc_table) {
- int flags = RT6_LOOKUP_F_IFACE;
-
- grt = ip6_nh_lookup_table(net, cfg, gw_addr,
- cfg->fc_table, flags);
- if (grt) {
- if (grt->rt6i_flags & RTF_GATEWAY ||
- (dev && dev != grt->dst.dev)) {
- ip6_rt_put(grt);
- grt = NULL;
- }
- }
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
+ cfg->fc_table, flags, &res);
+ /* gw_addr can not require a gateway or resolve to a reject
+ * route. If a device is given, it must match the result.
+ */
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family ||
+ (dev && dev != res.nh->fib_nh_dev))
+ err = -EHOSTUNREACH;
}
- if (!grt)
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
+ if (err < 0) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ };
+
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family)
+ err = -EHOSTUNREACH;
- if (!grt)
- goto out;
+ if (err)
+ return err;
+
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
+ }
+ err = 0;
if (dev) {
- if (dev != grt->dst.dev) {
- ip6_rt_put(grt);
- goto out;
- }
+ if (dev != res.nh->fib_nh_dev)
+ err = -EHOSTUNREACH;
} else {
- *_dev = dev = grt->dst.dev;
- *idev = grt->rt6i_idev;
+ *_dev = dev = res.nh->fib_nh_dev;
dev_hold(dev);
- in6_dev_hold(grt->rt6i_idev);
+ *idev = in6_dev_get(dev);
}
- if (!(grt->rt6i_flags & RTF_GATEWAY))
- err = 0;
-
- ip6_rt_put(grt);
-
-out:
return err;
}
@@ -2934,11 +3288,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
goto out;
}
+ rcu_read_lock();
+
if (cfg->fc_flags & RTNH_F_ONLINK)
err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
else
err = ip6_route_check_nh(net, cfg, _dev, idev);
+ rcu_read_unlock();
+
if (err)
goto out;
}
@@ -3039,7 +3397,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
goto out;
}
}
- goto set_dev;
+ goto pcpu_alloc;
}
if (cfg->fc_flags & RTF_GATEWAY) {
@@ -3075,7 +3433,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
-set_dev:
+
+pcpu_alloc:
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
+ }
+
fib6_nh->fib_nh_dev = dev;
fib6_nh->fib_nh_oif = dev->ifindex;
err = 0;
@@ -3095,6 +3460,38 @@ out:
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
+ struct rt6_exception_bucket *bucket;
+
+ rcu_read_lock();
+
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+ if (bucket) {
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+ kfree(bucket);
+ }
+
+ rcu_read_unlock();
+
+ if (fib6_nh->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+
+ free_percpu(fib6_nh->rt6i_pcpu);
+ }
+
fib_nh_common_release(&fib6_nh->nh_common);
}
@@ -3104,7 +3501,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
{
struct net *net = cfg->fc_nlinfo.nl_net;
struct fib6_info *rt = NULL;
+ struct nexthop *nh = NULL;
struct fib6_table *table;
+ struct fib6_nh *fib6_nh;
int err = -EINVAL;
int addr_type;
@@ -3140,6 +3539,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
#endif
+ if (cfg->fc_nh_id) {
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out;
+ }
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out;
+ }
err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
@@ -3157,7 +3566,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags);
+ rt = fib6_info_alloc(gfp_flags, !nh);
if (!rt)
goto out;
@@ -3197,19 +3606,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
- err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
- if (err)
- goto out;
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ goto out;
+ }
+ if (rt->fib6_src.plen) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto out;
+ }
+ rt->nh = nh;
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+ } else {
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ if (err)
+ goto out;
- /* We cannot add true routes via loopback here,
- * they would result in kernel looping; promote them to reject routes
- */
- addr_type = ipv6_addr_type(&cfg->fc_dst);
- if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
- rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ fib6_nh = rt->fib6_nh;
+
+ /* We cannot add true routes via loopback here, they would
+ * result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+ addr_type))
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ }
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
- struct net_device *dev = fib6_info_nh_dev(rt);
+ struct net_device *dev = fib6_nh->fib_nh_dev;
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
@@ -3301,6 +3726,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
info->skip_notify = 1;
}
+ info->skip_notify_kernel = 1;
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt,
+ rt->fib6_nsiblings,
+ NULL);
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings,
fib6_siblings) {
@@ -3323,7 +3754,7 @@ out_put:
return err;
}
-static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
int rc = -ESRCH;
@@ -3339,10 +3770,49 @@ out:
return rc;
}
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
+ struct fib6_nh *nh)
+{
+ struct fib6_result res = {
+ .f6i = rt,
+ .nh = nh,
+ };
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
+ if (rt_cache)
+ return __ip6_del_cached_rt(rt_cache, cfg);
+
+ return 0;
+}
+
+struct fib6_nh_del_cached_rt_arg {
+ struct fib6_config *cfg;
+ struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
+ int rc;
+
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+ return rc != -ESRCH ? rc : 0;
+}
+
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+ struct fib6_nh_del_cached_rt_arg arg = {
+ .cfg = cfg,
+ .f6i = f6i
+ };
+
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt_cache;
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
@@ -3365,26 +3835,45 @@ static int ip6_route_del(struct fib6_config *cfg,
for_each_fib6_node_rt_rcu(fn) {
struct fib6_nh *nh;
+ if (rt->nh && cfg->fc_nh_id &&
+ rt->nh->id != cfg->fc_nh_id)
+ continue;
+
if (cfg->fc_flags & RTF_CACHE) {
- struct fib6_result res = {
- .f6i = rt,
- };
- int rc;
-
- rt_cache = rt6_find_cached_rt(&res,
- &cfg->fc_dst,
- &cfg->fc_src);
- if (rt_cache) {
- rc = ip6_del_cached_rt(rt_cache, cfg);
- if (rc != -ESRCH) {
- rcu_read_unlock();
- return rc;
- }
+ int rc = 0;
+
+ if (rt->nh) {
+ rc = ip6_del_cached_rt_nh(cfg, rt);
+ } else if (cfg->fc_nh_id) {
+ continue;
+ } else {
+ nh = rt->fib6_nh;
+ rc = ip6_del_cached_rt(cfg, rt, nh);
+ }
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
}
continue;
}
- nh = &rt->fib6_nh;
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+ continue;
+ if (cfg->fc_protocol &&
+ cfg->fc_protocol != rt->fib6_protocol)
+ continue;
+
+ if (rt->nh) {
+ if (!fib6_info_hold_safe(rt))
+ continue;
+ rcu_read_unlock();
+
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ }
+ if (cfg->fc_nh_id)
+ continue;
+
+ nh = rt->fib6_nh;
if (cfg->fc_ifindex &&
(!nh->fib_nh_dev ||
nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@@ -3392,10 +3881,6 @@ static int ip6_route_del(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
- continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
- continue;
if (!fib6_info_hold_safe(rt))
continue;
rcu_read_unlock();
@@ -3506,7 +3991,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (!res.f6i)
goto out;
- res.nh = &res.f6i->fib6_nh;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev. Should be impossible
+ */
+ if (!arg.match)
+ goto out;
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
@@ -3558,12 +4061,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
+ /* these routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
- !rt->fib6_nh.fib_nh_gw_family)
+ !rt->fib6_nh->fib_nh_gw_family)
continue;
- if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
continue;
if (!fib6_info_hold_safe(rt))
continue;
@@ -3621,8 +4127,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- struct fib6_nh *nh = &rt->fib6_nh;
+ struct fib6_nh *nh;
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ continue;
+
+ nh = rt->fib6_nh;
if (dev == nh->fib_nh_dev &&
((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
ipv6_addr_equal(&nh->fib_nh_gw6, addr))
@@ -3873,7 +4384,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
+ if (!rt->nh &&
+ ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
rt != net->ipv6.fib6_null_entry &&
ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
@@ -3901,18 +4413,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
+ struct fib6_nh *nh;
+
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ return 0;
+ nh = rt->fib6_nh;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- rt->fib6_nh.fib_nh_gw_family &&
- ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
return -1;
- }
/* Further clean up cached routes in exception table.
* This is needed because cached route may have a different
* gateway than its 'parent' in the case of an ip redirect.
*/
- rt6_exceptions_clean_tohost(rt, gateway);
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
return 0;
}
@@ -3950,11 +4466,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
return NULL;
}
+/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
- if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
- (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
- ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
return true;
return false;
@@ -3966,11 +4483,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
int total = 0;
if (!rt6_is_dead(rt))
- total += rt->fib6_nh.fib_nh_weight;
+ total += rt->fib6_nh->fib_nh_weight;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
- total += iter->fib6_nh.fib_nh_weight;
+ total += iter->fib6_nh->fib_nh_weight;
}
return total;
@@ -3981,11 +4498,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
- *weight += rt->fib6_nh.fib_nh_weight;
+ *weight += rt->fib6_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
- atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -4028,9 +4545,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
const struct arg_netdev_event *arg = p_arg;
struct net *net = dev_net(arg->dev);
- if (rt != net->ipv6.fib6_null_entry &&
- rt->fib6_nh.fib_nh_dev == arg->dev) {
- rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@@ -4053,15 +4570,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
+/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
const struct net_device *dev)
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
+ if (rt->fib6_nh->fib_nh_dev == dev)
return true;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
+ if (iter->fib6_nh->fib_nh_dev == dev)
return true;
return false;
@@ -4082,12 +4600,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
struct fib6_info *iter;
unsigned int dead = 0;
- if (rt->fib6_nh.fib_nh_dev == down_dev ||
- rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == down_dev ||
- iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
return dead;
@@ -4099,11 +4617,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
- rt->fib6_nh.fib_nh_flags |= nh_flags;
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
- iter->fib6_nh.fib_nh_flags |= nh_flags;
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
@@ -4113,17 +4631,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
const struct net_device *dev = arg->dev;
struct net *net = dev_net(dev);
- if (rt == net->ipv6.fib6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
return 0;
switch (arg->event) {
case NETDEV_UNREGISTER:
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
if (!rt->fib6_nsiblings)
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
@@ -4139,10 +4657,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
}
return -2;
case NETDEV_CHANGE:
- if (rt->fib6_nh.fib_nh_dev != dev ||
+ if (rt->fib6_nh->fib_nh_dev != dev ||
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
- rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@@ -4176,9 +4694,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event)
struct rt6_mtu_change_arg {
struct net_device *dev;
unsigned int mtu;
+ struct fib6_info *f6i;
};
-static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
+ struct fib6_info *f6i = arg->f6i;
+
+ /* For administrative MTU increase, there is no way to discover
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
+ * Since RFC 1981 doesn't include administrative MTU increase
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ if (nh->fib_nh_dev == arg->dev) {
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
+ u32 mtu = f6i->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
+
+ spin_lock_bh(&rt6_exception_lock);
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
+ }
+
+ return 0;
+}
+
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -4193,24 +4738,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
if (!idev)
return 0;
- /* For administrative MTU increase, there is no way to discover
- IPv6 PMTU increase, so PMTU increase should be updated here.
- Since RFC 1981 doesn't include administrative MTU increase
- update PMTU increase is a MUST. (i.e. jumbo frame)
- */
- if (rt->fib6_nh.fib_nh_dev == arg->dev &&
- !fib6_metric_locked(rt, RTAX_MTU)) {
- u32 mtu = rt->fib6_pmtu;
-
- if (mtu >= arg->mtu ||
- (mtu < arg->mtu && mtu == idev->cnf.mtu6))
- fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+ if (fib6_metric_locked(f6i, RTAX_MTU))
+ return 0;
- spin_lock_bh(&rt6_exception_lock);
- rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
- spin_unlock_bh(&rt6_exception_lock);
+ arg->f6i = f6i;
+ if (f6i->nh) {
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
+ arg);
}
- return 0;
+
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@@ -4224,6 +4762,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
[RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
@@ -4241,6 +4780,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4287,6 +4827,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+ if (tb[RTA_NH_ID]) {
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ goto errout;
+ }
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+ }
+
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
@@ -4430,6 +4980,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
+ enum fib_event_type event_type;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
struct fib6_info *rt;
@@ -4489,7 +5040,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
- rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
rt, &r_cfg);
@@ -4501,12 +5052,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = rtnh_next(rtnh, &remaining);
}
+ if (list_empty(&rt6_nh_list)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
* the full route when done
*/
info->skip_notify = 1;
+ /* For add and replace, send one notification with all nexthops. For
+ * append, send one notification with all appended nexthops.
+ */
+ info->skip_notify_kernel = 1;
+
err_nh = NULL;
list_for_each_entry(nh, &rt6_nh_list, next) {
err = __ip6_ins_rt(nh->fib6_info, info, extack);
@@ -4543,6 +5105,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nhn++;
}
+ event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD;
+ err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type,
+ rt_notif, nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
+
/* success ... tell user about new route */
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
goto cleanup;
@@ -4621,6 +5192,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ if (cfg.fc_nh_id &&
+ !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
+
if (cfg.fc_mp)
return ip6_route_multipath_del(&cfg, extack);
else {
@@ -4648,17 +5225,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static size_t rt6_nlmsg_size(struct fib6_info *rt)
+/* add the overhead of this fib6_nh to nexthop_len */
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
+{
+ int *nexthop_len = arg;
+
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16); /* RTA_GATEWAY */
+
+ if (nh->fib_nh_lws) {
+ /* RTA_ENCAP_TYPE */
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+ /* RTA_ENCAP */
+ *nexthop_len += nla_total_size(2);
+ }
+
+ return 0;
+}
+
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
- int nexthop_len = 0;
+ int nexthop_len;
+
+ if (f6i->nh) {
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
+ &nexthop_len);
+ } else {
+ struct fib6_nh *nh = f6i->fib6_nh;
- if (rt->fib6_nsiblings) {
- nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
- + NLA_ALIGN(sizeof(struct rtnexthop))
- + nla_total_size(16) /* RTA_GATEWAY */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
+ nexthop_len = 0;
+ if (f6i->fib6_nsiblings) {
+ nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16) /* RTA_GATEWAY */
+ + lwtunnel_get_encap_size(nh->fib_nh_lws);
- nexthop_len *= rt->fib6_nsiblings;
+ nexthop_len *= f6i->fib6_nsiblings;
+ }
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4674,10 +5280,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
+ nexthop_len;
}
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+ unsigned char *flags)
+{
+ if (nexthop_is_multipath(nh)) {
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (nexthop_mpath_fill_node(skb, nh))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, mp);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = nexthop_fib6_nh(nh);
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+ flags, false) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -4687,6 +5321,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct rt6_info *rt6 = (struct rt6_info *)dst;
struct rt6key *rt6_dst, *rt6_src;
u32 *pmetrics, table, rt6_flags;
+ unsigned char nh_flags = 0;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
long expires = 0;
@@ -4794,22 +5429,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (!mp)
goto nla_put_failure;
- if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
- rt->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+ rt->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
- if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
- sibling->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+ sibling->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, mp);
- } else {
- unsigned char nh_flags = 0;
+ } else if (rt->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+ goto nla_put_failure;
+
+ if (nexthop_is_blackhole(rt->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+
+ if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ goto nla_put_failure;
- if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
+ rtm->rtm_flags |= nh_flags;
+ } else {
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
&nh_flags, false) < 0)
goto nla_put_failure;
@@ -4836,10 +5480,28 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ if (nh->fib_nh_dev == dev)
+ return 1;
+
+ return 0;
+}
+
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
const struct net_device *dev)
{
- if (f6i->fib6_nh.fib_nh_dev == dev)
+ if (f6i->nh) {
+ struct net_device *_dev = (struct net_device *)dev;
+
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
+ fib6_info_nh_uses_dev,
+ _dev);
+ }
+
+ if (f6i->fib6_nh->fib_nh_dev == dev)
return true;
if (f6i->fib6_nsiblings) {
@@ -4847,7 +5509,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
list_for_each_entry_safe(sibling, next_sibling,
&f6i->fib6_siblings, fib6_siblings) {
- if (sibling->fib6_nh.fib_nh_dev == dev)
+ if (sibling->fib6_nh->fib_nh_dev == dev)
return true;
}
}
@@ -4855,33 +5517,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
return false;
}
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
+struct fib6_nh_exception_dump_walker {
+ struct rt6_rtnl_dump_arg *dump;
+ struct fib6_info *rt;
+ unsigned int flags;
+ unsigned int skip;
+ unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_nh_exception_dump_walker *w = arg;
+ struct rt6_rtnl_dump_arg *dump = w->dump;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i, err;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+ if (!bucket)
+ return 0;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (w->skip) {
+ w->skip--;
+ continue;
+ }
+
+ /* Expiration of entries doesn't bump sernum, insertion
+ * does. Removal is triggered by insertion, so we can
+ * rely on the fact that if entries change between two
+ * partial dumps, this node is scanned again completely,
+ * see rt6_insert_exception() and fib6_dump_table().
+ *
+ * Count expired entries we go through as handled
+ * entries that we'll skip next time, in case of partial
+ * node dump. Otherwise, if entries expire meanwhile,
+ * we'll skip the wrong amount.
+ */
+ if (rt6_check_expired(rt6_ex->rt6i)) {
+ w->count++;
+ continue;
+ }
+
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
+ RTM_NEWROUTE,
+ NETLINK_CB(dump->cb->skb).portid,
+ dump->cb->nlh->nlmsg_seq, w->flags);
+ if (err)
+ return err;
+
+ w->count++;
+ }
+ bucket++;
+ }
+
+ return 0;
+}
+
+/* Return -1 if done with node, number of handled routes on partial dump */
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
struct fib_dump_filter *filter = &arg->filter;
unsigned int flags = NLM_F_MULTI;
struct net *net = arg->net;
+ int count = 0;
if (rt == net->ipv6.fib6_null_entry)
- return 0;
+ return -1;
if ((filter->flags & RTM_F_PREFIX) &&
!(rt->fib6_flags & RTF_PREFIX_RT)) {
/* success since this is not a prefix route */
- return 1;
+ return -1;
}
- if (filter->filter_set) {
- if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
- (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
- (filter->protocol && rt->fib6_protocol != filter->protocol)) {
- return 1;
- }
+ if (filter->filter_set &&
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+ return -1;
+ }
+
+ if (filter->filter_set ||
+ !filter->dump_routes || !filter->dump_exceptions) {
flags |= NLM_F_DUMP_FILTERED;
}
- return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
- RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
- arg->cb->nlh->nlmsg_seq, flags);
+ if (filter->dump_routes) {
+ if (skip) {
+ skip--;
+ } else {
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+ 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, flags)) {
+ return 0;
+ }
+ count++;
+ }
+ }
+
+ if (filter->dump_exceptions) {
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
+ .rt = rt,
+ .flags = flags,
+ .skip = skip,
+ .count = 0 };
+ int err;
+
+ rcu_read_lock();
+ if (rt->nh) {
+ err = nexthop_for_each_fib6_nh(rt->nh,
+ rt6_nh_dump_exceptions,
+ &w);
+ } else {
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return count += w.count;
+ }
+
+ return -1;
}
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
@@ -5126,6 +5886,38 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ /* call_fib6_entry_notifiers will be removed when in-kernel notifier
+ * is implemented and supported for nexthop objects
+ */
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
+
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
static int ip6_route_dev_notify(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -5136,7 +5928,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
- net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5330,11 +6122,11 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
- net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
- sizeof(*net->ipv6.fib6_null_entry),
- GFP_KERNEL);
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
if (!net->ipv6.fib6_null_entry)
goto out_ip6_dst_entries;
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry));
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
@@ -5344,6 +6136,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
@@ -5355,6 +6148,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -5364,6 +6158,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#endif
net->ipv6.sysctl.flush_delay = 0;
@@ -5471,7 +6266,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
- init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..6d86fac472e7 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -23,6 +23,7 @@
static int zero;
static int one = 1;
+static int three = 3;
static int auto_flowlabels_min;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -114,6 +115,8 @@ static struct ctl_table ipv6_table_template[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &three,
},
{
.procname = "max_dst_opts_number",
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7a14ea37d2df..408d9ec26971 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif;
}
- if (sk)
- mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ mark = inet_twsk(sk)->tw_mark;
+ /* autoflowlabel relies on buff->hash */
+ skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
+ PKT_HASH_TYPE_L4);
+ } else {
+ mark = sk->sk_mark;
+ }
+ buff->tstamp = tcp_transmit_time(sk);
+ }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
@@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u32 seq = 0, ack_seq = 0;
struct tcp_md5sig_key *key = NULL;
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
unsigned char newhash[16];
int genhash;
struct sock *sk1 = NULL;
#endif
+ __be32 label = 0;
+ struct net *net;
int oif = 0;
if (th->rst)
@@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (!sk && !ipv6_unicast_destination(skb))
return;
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
@@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
+ sk1 = inet6_lookup_listener(net,
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
@@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk))
trace_tcp_send_reset(sk, skb);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ } else {
+ if (net->ipv6.sysctl.flowlabel_reflect & 2)
+ label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
+ label);
#ifdef CONFIG_TCP_MD5SIG
out:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 70b01bd95022..66ca5a4b17c4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,16 +54,6 @@
#include <trace/events/skb.h>
#include "udp_impl.h"
-static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if defined(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
- return true;
-#endif
- return false;
-}
-
static u32 udp6_ehashfn(const struct net *net,
const struct in6_addr *laddr,
const u16 lport,
@@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk)
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
@@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net,
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
- struct udp_hslot *hslot2, struct sk_buff *skb)
+ int dif, int sdif, struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness;
@@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ daddr, hnum, dif, sdif);
if (score > badness) {
if (sk->sk_reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
@@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
struct sock *result;
- bool exact_dif = udp6_lib_exact_dif_match(net, skb);
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif,
+ daddr, hnum, dif, sdif,
hslot2, skb);
if (!result) {
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
@@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
&in6addr_any, hnum, dif, sdif,
- exact_dif, hslot2,
- skb);
+ hslot2, skb);
}
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/key/af_key.c b/net/key/af_key.c
index a50dd6f34b91..39b3d95094eb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -928,8 +928,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->props.saddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
/* dst address */
addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size);
@@ -944,8 +943,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->id.daddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr,
x->props.family)) {
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 6e2b4b9267e1..35bb4f3bdbe0 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -31,7 +31,6 @@
#include "l2tp_core.h"
static struct dentry *rootdir;
-static struct dentry *tunnels;
struct l2tp_dfs_seq_data {
struct net *net;
@@ -326,32 +325,18 @@ static const struct file_operations l2tp_dfs_fops = {
static int __init l2tp_debugfs_init(void)
{
- int rc = 0;
-
rootdir = debugfs_create_dir("l2tp", NULL);
- if (IS_ERR(rootdir)) {
- rc = PTR_ERR(rootdir);
- rootdir = NULL;
- goto out;
- }
- tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
- if (tunnels == NULL)
- rc = -EIO;
+ debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
pr_info("L2TP debugfs support\n");
-out:
- if (rc)
- pr_warn("unable to init\n");
-
- return rc;
+ return 0;
}
static void __exit l2tp_debugfs_exit(void)
{
- debugfs_remove(tunnels);
- debugfs_remove(rootdir);
+ debugfs_remove_recursive(rootdir);
}
module_init(l2tp_debugfs_init);
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index cfc9fcb97465..f35899d45a9a 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
* local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
+ * This function does not hold refcnt on the returned dst.
+ * Caller must hold rcu_read_lock().
*/
struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
@@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
struct dst_entry *dst = NULL;
struct net_device *dev;
+ WARN_ON_ONCE(!rcu_read_lock_held());
if (fl6->flowi6_oif) {
- rcu_read_lock();
-
dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (dev && netif_is_l3_slave(dev))
dev = netdev_master_upper_dev_get_rcu(dev);
@@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_link_scope_lookup)
dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
-
- rcu_read_unlock();
}
return dst;
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index 5d2d1f746b91..3c03f6512c5f 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -68,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb)
lapb_put(lapb);
}
}
-EXPORT_SYMBOL(lapb_register);
/*
* Add a socket to the bound sockets list.
@@ -115,7 +114,6 @@ static struct lapb_cb *lapb_create_cb(void)
{
struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);
-
if (!lapb)
goto out;
@@ -167,6 +165,7 @@ out:
write_unlock_bh(&lapb_list_lock);
return rc;
}
+EXPORT_SYMBOL(lapb_register);
int lapb_unregister(struct net_device *dev)
{
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index a1973a26c7fc..4f12d042c89c 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -5,6 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018-2019 Intel Corporation
* Copyright (C) 2018 Intel Corporation
*/
@@ -974,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
BSS_CHANGED_BEACON |
BSS_CHANGED_SSID |
BSS_CHANGED_P2P_PS |
- BSS_CHANGED_TXPOWER;
+ BSS_CHANGED_TXPOWER |
+ BSS_CHANGED_TWT;
int err;
int prev_beacon_int;
@@ -1044,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.dtim_period = params->dtim_period;
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
+ sdata->vif.bss_conf.twt_responder = params->twt_responder;
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1465,7 +1468,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
return ret;
}
- if (params->supported_rates) {
+ if (params->supported_rates && params->supported_rates_len) {
ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
sband, params->supported_rates,
params->supported_rates_len,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 271bc2b676a4..2e7f75938c51 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -272,6 +272,7 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_MULTI_BSSID),
FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
FLAG(EXT_KEY_ID_NATIVE),
+ FLAG(NO_AMPDU_KEYBORDER_SUPPORT),
#undef FLAG
};
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 3509ce0daea3..7b8735ced2a1 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -339,9 +339,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key)
key->debugfs.dir = debugfs_create_dir(buf,
key->local->debugfs.keys);
- if (!key->debugfs.dir)
- return;
-
sta = key->sta;
if (sta) {
sprintf(buf, "../../netdev:%s/stations/%pM",
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index f1f2e1c7ac0c..b1438fd4d876 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -815,9 +815,8 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
sprintf(buf, "netdev:%s", sdata->name);
sdata->vif.debugfs_dir = debugfs_create_dir(buf,
sdata->local->hw.wiphy->debugfsdir);
- if (sdata->vif.debugfs_dir)
- sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
- sdata->vif.debugfs_dir);
+ sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+ sdata->vif.debugfs_dir);
add_files(sdata);
}
@@ -842,8 +841,5 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
return;
sprintf(buf, "netdev:%s", sdata->name);
- if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
- sdata_err(sdata,
- "debugfs: failed to rename debugfs dir to %s\n",
- buf);
+ debugfs_rename(dir->d_parent, dir, dir->d_parent, buf);
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 3fd79ccb293b..c8ad20c28c43 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -957,8 +957,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
* dir might still be around.
*/
sta->debugfs_dir = debugfs_create_dir(mac, stations_dir);
- if (!sta->debugfs_dir)
- return;
DEBUGFS_ADD(flags);
DEBUGFS_ADD(aid);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 157ff5f890d2..dd60f6428049 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -269,50 +269,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key)
assert_key_lock(local);
sta->ptk_idx = key->conf.keyidx;
+
+ if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT))
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_check_fast_xmit(sta);
return 0;
}
-static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
- struct ieee80211_key *new_key,
- bool pairwise)
+static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
+ struct ieee80211_key *new)
{
- struct ieee80211_sub_if_data *sdata;
- struct ieee80211_local *local;
- struct sta_info *sta;
- int ret;
-
- /* Aggregation sessions are OK when running on SW crypto.
- * A broken remote STA may cause issues not observed with HW
- * crypto, though.
- */
- if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
- return 0;
+ struct ieee80211_local *local = new->local;
+ struct sta_info *sta = new->sta;
+ int i;
- assert_key_lock(old_key->local);
- sta = old_key->sta;
+ assert_key_lock(local);
- /* Unicast rekey without Extended Key ID needs special handling */
- if (new_key && sta && pairwise &&
- rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) {
- local = old_key->local;
- sdata = old_key->sdata;
+ if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) {
+ /* Extended Key ID key install, initial one or rekey */
+
+ if (sta->ptk_idx != INVALID_PTK_KEYIDX &&
+ ieee80211_hw_check(&local->hw,
+ NO_AMPDU_KEYBORDER_SUPPORT)) {
+ /* Aggregation Sessions with Extended Key ID must not
+ * mix MPDUs with different keyIDs within one A-MPDU.
+ * Tear down any running Tx aggregation and all new
+ * Rx/Tx aggregation request during rekey if the driver
+ * asks us to do so. (Blocking Tx only would be
+ * sufficient but WLAN_STA_BLOCK_BA gets the job done
+ * for the few ms we need it.)
+ */
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ ___ieee80211_stop_tx_ba_session(sta, i,
+ AGG_STOP_LOCAL_REQUEST);
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+ }
+ } else if (old) {
+ /* Rekey without Extended Key ID.
+ * Aggregation sessions are OK when running on SW crypto.
+ * A broken remote STA may cause issues not observed with HW
+ * crypto, though.
+ */
+ if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ return;
- /* Stop TX till we are on the new key */
- old_key->flags |= KEY_FLAG_TAINTED;
+ /* Stop Tx till we are on the new key */
+ old->flags |= KEY_FLAG_TAINTED;
ieee80211_clear_fast_xmit(sta);
-
- /* Aggregation sessions during rekey are complicated due to the
- * reorder buffer and retransmits. Side step that by blocking
- * aggregation during rekey and tear down running sessions.
- */
if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_sta_tear_down_BA_sessions(sta,
AGG_STOP_LOCAL_REQUEST);
}
-
if (!wiphy_ext_feature_isset(local->hw.wiphy,
NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
@@ -320,18 +331,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
/* Flushing the driver queues *may* help prevent
* the clear text leaks and freezes.
*/
- ieee80211_flush_queues(local, sdata, false);
+ ieee80211_flush_queues(local, old->sdata, false);
}
}
-
- ieee80211_key_disable_hw_accel(old_key);
-
- if (new_key)
- ret = ieee80211_key_enable_hw_accel(new_key);
- else
- ret = 0;
-
- return ret;
}
static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -389,7 +391,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&sdata->local->key_mtx);
}
-
static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
bool pairwise,
@@ -397,7 +398,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct ieee80211_key *new)
{
int idx;
- int ret;
+ int ret = 0;
bool defunikey, defmultikey, defmgmtkey;
/* caller must provide at least one old/new */
@@ -409,16 +410,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
+ if (new && sta && pairwise) {
+ /* Unicast rekey needs special handling. With Extended Key ID
+ * old is still NULL for the first rekey.
+ */
+ ieee80211_pairwise_rekey(old, new);
+ }
+
if (old) {
idx = old->conf.keyidx;
- ret = ieee80211_hw_key_replace(old, new, pairwise);
+
+ if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
+ ieee80211_key_disable_hw_accel(old);
+
+ if (new)
+ ret = ieee80211_key_enable_hw_accel(new);
+ }
} else {
/* new must be provided in case old is not */
idx = new->conf.keyidx;
if (!new->local->wowlan)
ret = ieee80211_key_enable_hw_accel(new);
- else
- ret = 0;
}
if (ret)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 55583b71ffaf..85e416248753 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -351,11 +351,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
sdata_lock(sdata);
/* Copy the addresses to the bss_conf list */
- ifa = idev->ifa_list;
+ ifa = rtnl_dereference(idev->ifa_list);
while (ifa) {
if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
bss_conf->arp_addr_list[c] = ifa->ifa_address;
- ifa = ifa->ifa_next;
+ ifa = rtnl_dereference(ifa->ifa_next);
c++;
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 379d2ab6d327..96014f459a2b 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3155,6 +3155,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta,
IEEE80211_HE_MAC_CAP0_TWT_RES;
}
+static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee802_11_elems *elems)
+{
+ bool twt = ieee80211_twt_req_supported(sta, elems);
+
+ if (sdata->vif.bss_conf.twt_requester != twt) {
+ sdata->vif.bss_conf.twt_requester = twt;
+ return BSS_CHANGED_TWT;
+ }
+ return 0;
+}
+
static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss,
struct ieee80211_mgmt *mgmt, size_t len)
@@ -3337,8 +3350,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta);
bss_conf->he_support = sta->sta.he_cap.has_he;
- bss_conf->twt_requester =
- ieee80211_twt_req_supported(sta, &elems);
+ changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
} else {
bss_conf->he_support = false;
bss_conf->twt_requester = false;
@@ -3998,6 +4010,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, bssid);
+ changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
+
if (ieee80211_config_bw(sdata, sta,
elems.ht_cap_elem, elems.ht_operation,
elems.vht_operation, elems.he_operation,
@@ -4948,7 +4962,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
basic_rates = BIT(min_rate_index);
}
- new_sta->sta.supp_rates[cbss->channel->band] = rates;
+ if (rates)
+ new_sta->sta.supp_rates[cbss->channel->band] = rates;
+ else
+ sdata_info(sdata,
+ "No rates found, keeping mandatory only\n");
+
sdata->vif.bss_conf.basic_rates = basic_rates;
/* cf. IEEE 802.11 9.2.12 */
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 6e5961d7f639..60ef8972b254 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -199,6 +199,10 @@ static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc)
cfg80211_remain_on_channel_expired(&roc->sdata->wdev,
roc->cookie, roc->chan,
GFP_KERNEL);
+ else
+ cfg80211_tx_mgmt_expired(&roc->sdata->wdev,
+ roc->mgmt_tx_cookie,
+ roc->chan, GFP_KERNEL);
list_del(&roc->list);
kfree(roc);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 47ee36677c2b..a1e9fc7878aa 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -354,8 +354,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
break;
}
WARN_ONCE(i == sband->n_bitrates,
- "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n",
+ "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n",
+ sta ? sta->addr : NULL,
sta ? sta->supp_rates[sband->band] : -1,
+ sband->band,
rate_mask, rate_flags);
info->control.rates[0].count =
@@ -366,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
}
-bool rate_control_send_low(struct ieee80211_sta *pubsta,
- void *priv_sta,
- struct ieee80211_tx_rate_control *txrc)
+static bool rate_control_send_low(struct ieee80211_sta *pubsta,
+ struct ieee80211_tx_rate_control *txrc)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
struct ieee80211_supported_band *sband = txrc->sband;
@@ -376,7 +377,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
int mcast_rate;
bool use_basicrate = false;
- if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
+ if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) {
__rate_control_send_low(txrc->hw, sband, pubsta, info,
txrc->rate_idx_mask);
@@ -402,7 +403,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
}
return false;
}
-EXPORT_SYMBOL(rate_control_send_low);
static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask)
{
@@ -885,26 +885,29 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
int i;
- if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) {
- ista = &sta->sta;
- priv_sta = sta->rate_ctrl_priv;
- }
-
for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
info->control.rates[i].idx = -1;
info->control.rates[i].flags = 0;
info->control.rates[i].count = 0;
}
+ if (rate_control_send_low(sta ? &sta->sta : NULL, txrc))
+ return;
+
if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL))
return;
+ if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) {
+ ista = &sta->sta;
+ priv_sta = sta->rate_ctrl_priv;
+ }
+
if (ista) {
spin_lock_bh(&sta->rate_ctrl_lock);
ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
spin_unlock_bh(&sta->rate_ctrl_lock);
} else {
- ref->ops->get_rate(ref->priv, NULL, NULL, txrc);
+ rate_control_send_low(NULL, txrc);
}
if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE))
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index a34e9c2ca626..ee86c3333999 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
int delta;
int sampling_ratio;
- /* management/no-ack frames do not use rate control */
- if (rate_control_send_low(sta, priv_sta, txrc))
- return;
-
/* check multi-rate-retry capabilities & adjust lookaround_rate */
mrr_capable = mp->has_mrr &&
!txrc->rts &&
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 298a1acb3ce5..5a882da82f0e 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1093,9 +1093,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
struct minstrel_priv *mp = priv;
int sample_idx;
- if (rate_control_send_low(sta, priv_sta, txrc))
- return;
-
if (!msp->is_ht)
return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 187f62a48b2b..95eb8220e2e4 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2019 Intel Corporation
*/
#include <linux/module.h>
@@ -401,6 +401,47 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
for (i = 0; i < IEEE80211_NUM_TIDS; i++)
sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ u32 mandatory = 0;
+ int r;
+
+ if (!hw->wiphy->bands[i])
+ continue;
+
+ switch (i) {
+ case NL80211_BAND_2GHZ:
+ /*
+ * We use both here, even if we cannot really know for
+ * sure the station will support both, but the only use
+ * for this is when we don't know anything yet and send
+ * management frames, and then we'll pick the lowest
+ * possible rate anyway.
+ * If we don't include _G here, we cannot find a rate
+ * in P2P, and thus trigger the WARN_ONCE() in rate.c
+ */
+ mandatory = IEEE80211_RATE_MANDATORY_B |
+ IEEE80211_RATE_MANDATORY_G;
+ break;
+ case NL80211_BAND_5GHZ:
+ mandatory = IEEE80211_RATE_MANDATORY_A;
+ break;
+ case NL80211_BAND_60GHZ:
+ WARN_ON(1);
+ mandatory = 0;
+ break;
+ }
+
+ for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) {
+ struct ieee80211_rate *rate;
+
+ rate = &hw->wiphy->bands[i]->bitrates[r];
+
+ if (!(rate->flags & mandatory))
+ continue;
+ sta->sta.supp_rates[i] |= BIT(r);
+ }
+ }
+
sta->sta.smps_mode = IEEE80211_SMPS_OFF;
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b96fd3f54705..817a9e5d16e4 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
}
EXPORT_SYMBOL(nf_hook_slow);
-
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
-{
- if (writable_len > skb->len)
- return 0;
-
- /* Not exclusive use of packet? Must copy. */
- if (!skb_cloned(skb)) {
- if (writable_len <= skb_headlen(skb))
- return 1;
- } else if (skb_clone_writable(skb, writable_len))
- return 1;
-
- if (writable_len <= skb_headlen(skb))
- writable_len = 0;
- else
- writable_len -= skb_headlen(skb);
-
- return !!__pskb_pull_tail(skb, writable_len);
-}
-EXPORT_SYMBOL(skb_make_writable);
-
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 8acc4e173167..063df74b4647 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef __IP_SET_BITMAP_IP_GEN_H
#define __IP_SET_BITMAP_IP_GEN_H
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index e3884b0cca91..11ff9d4a7006 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module implementing an IP set type: the bitmap:ip type */
@@ -28,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index b73c37b3a791..ca7ac4a25ada 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -2,7 +2,6 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*/
/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
@@ -28,7 +27,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index d8c140553379..704a0dda1609 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the bitmap:port type */
@@ -23,7 +22,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:port");
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 3cdf171cd468..2e151856ad99 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module for IP set management */
@@ -48,7 +48,7 @@ static unsigned int max_sets;
module_param(max_sets, int, 0600);
MODULE_PARM_DESC(max_sets, "maximal number of sets");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("core IP set support");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
@@ -1290,11 +1290,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
+ int ret;
- /* Second pass, so parser can't fail */
- nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr,
- nlh->nlmsg_len - min_len, ip_set_setname_policy,
- NULL);
+ ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr,
+ nlh->nlmsg_len - min_len,
+ ip_set_setname_policy, NULL);
+ if (ret)
+ return ret;
cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
if (cda[IPSET_ATTR_SETNAME]) {
@@ -1541,10 +1543,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
cmdattr = (void *)&errmsg->msg + min_len;
- nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr,
- nlh->nlmsg_len - min_len,
- ip_set_adt_policy, NULL);
+ ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr,
+ nlh->nlmsg_len - min_len,
+ ip_set_adt_policy, NULL);
+ if (ret) {
+ nlmsg_free(skb2);
+ return ret;
+ }
errline = nla_data(cda[IPSET_ATTR_LINENO]);
*errline = lineno;
@@ -1558,10 +1564,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
return ret;
}
-static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_ad(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ enum ipset_adt adt,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
{
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
@@ -1590,18 +1598,17 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (attr[IPSET_ATTR_DATA]) {
if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ ret = call_ad(ctnl, skb, set, tb, adt, flags,
use_lineno);
} else {
int nla_rem;
nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(tb));
if (nla_type(nla) != IPSET_ATTR_DATA ||
!flag_nested(nla) ||
nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ ret = call_ad(ctnl, skb, set, tb, adt,
flags, use_lineno);
if (ret < 0)
return ret;
@@ -1610,56 +1617,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int ip_set_uadd(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const attr[],
struct netlink_ext_ack *extack)
{
- struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
- const struct nlattr *nla;
- u32 flags = flag_exist(nlh);
- bool use_lineno;
- int ret = 0;
-
- if (unlikely(protocol_min_failed(attr) ||
- !attr[IPSET_ATTR_SETNAME] ||
- !((attr[IPSET_ATTR_DATA] != NULL) ^
- (attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] &&
- !flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] &&
- (!flag_nested(attr[IPSET_ATTR_ADT]) ||
- !attr[IPSET_ATTR_LINENO]))))
- return -IPSET_ERR_PROTOCOL;
-
- set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (!set)
- return -ENOENT;
-
- use_lineno = !!attr[IPSET_ATTR_LINENO];
- if (attr[IPSET_ATTR_DATA]) {
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
- use_lineno);
- } else {
- int nla_rem;
+ return ip_set_ad(net, ctnl, skb,
+ IPSET_ADD, nlh, attr, extack);
+}
- nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(*tb));
- if (nla_type(nla) != IPSET_ATTR_DATA ||
- !flag_nested(nla) ||
- nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
- flags, use_lineno);
- if (ret < 0)
- return ret;
- }
- }
- return ret;
+static int ip_set_udel(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
+{
+ return ip_set_ad(net, ctnl, skb,
+ IPSET_DEL, nlh, attr, extack);
}
static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 2384e36aef5c..2b8f959574b4 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -1,5 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
*/
/* Get Layer-4 data from the packets */
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 10f619625abd..0feb77fa9edc 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef _IP_SET_HASH_GEN_H
#define _IP_SET_HASH_GEN_H
@@ -622,7 +621,7 @@ retry:
goto cleanup;
}
m->size = AHASH_INIT_SIZE;
- extsize = ext_size(AHASH_INIT_SIZE, dsize);
+ extsize += ext_size(AHASH_INIT_SIZE, dsize);
RCU_INIT_POINTER(hbucket(t, key), m);
} else if (m->pos >= m->size) {
struct hbucket *ht;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 69d7576be2e6..f4432d9fcad0 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip type */
@@ -27,7 +26,7 @@
#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip");
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index 6fe1ec0d2154..7a1734aad0c5 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,mark type */
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 74ec7e097e34..32e240658334 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index ced57d63b01f..15d419353179 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,ip");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 905f6cf0f55e..7a4d7afd4121 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
@@ -31,7 +30,7 @@
#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,net");
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 853e772ab4d9..d94c585d33c5 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:mac type */
@@ -20,7 +19,7 @@
#define IPSET_TYPE_REV_MAX 0
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:mac");
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 06c91e49bf25..c259cbc3ef45 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net type */
@@ -28,7 +27,7 @@
#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net");
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 0a8cbcdfb42b..87b29f971226 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,iface type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,iface");
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 832e4f5491cb..a3ae69bfee66 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
*/
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index a4f3f15b874a..799f2272cc65 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,port type */
@@ -30,7 +29,7 @@
#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,port");
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index e54d415405f3..a82b70e8b9a6 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 8ada318bf09d..6f9ead6319e0 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the list:set type */
@@ -19,7 +18,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_list:set");
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index bfd4365a8d73..4515056ef1c2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -358,7 +358,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
@@ -435,7 +435,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7138556b206b..e8651fd621ef 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -34,6 +34,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
+#include <net/gue.h>
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h> /* net_generic() */
@@ -892,7 +893,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
IPPROTO_SCTP == protocol)
offset += 2 * sizeof(__u16);
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto out;
#ifdef CONFIG_IP_VS_IPV6
@@ -1282,7 +1283,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
- if (!skb_make_writable(skb, iph->len))
+ if (skb_ensure_writable(skb, iph->len))
goto drop;
/* mangle the packet */
@@ -1574,6 +1575,41 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
return 1;
}
+/* Check the UDP tunnel and return its header length */
+static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ unsigned int offset, __u16 af,
+ const union nf_inet_addr *daddr, __u8 *proto)
+{
+ struct udphdr _udph, *udph;
+ struct ip_vs_dest *dest;
+
+ udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (!udph)
+ goto unk;
+ offset += sizeof(struct udphdr);
+ dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
+ if (!dest)
+ goto unk;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ struct guehdr _gueh, *gueh;
+
+ gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
+ if (!gueh)
+ goto unk;
+ if (gueh->control != 0 || gueh->version != 0)
+ goto unk;
+ /* Later we can support also IPPROTO_IPV6 */
+ if (gueh->proto_ctype != IPPROTO_IPIP)
+ goto unk;
+ *proto = gueh->proto_ctype;
+ return sizeof(struct udphdr) + sizeof(struct guehdr) +
+ (gueh->hlen << 2);
+ }
+
+unk:
+ return 0;
+}
+
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
@@ -1593,6 +1629,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
bool ipip, new_cp = false;
+ union nf_inet_addr *raddr;
*related = 1;
@@ -1631,20 +1668,51 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
+ raddr = (union nf_inet_addr *)&cih->daddr;
/* Special case for errors for IPIP packets */
ipip = false;
if (cih->protocol == IPPROTO_IPIP) {
+ struct ip_vs_dest *dest;
+
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
/* Error for our IPIP must arrive at LOCAL_IN */
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
return NF_ACCEPT;
+ dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
+ /* Only for known tunnel */
+ if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
+ return NF_ACCEPT;
offset += cih->ihl * 4;
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
ipip = true;
+ } else if (cih->protocol == IPPROTO_UDP && /* Can be UDP encap */
+ /* Error for our tunnel must arrive at LOCAL_IN */
+ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
+ __u8 iproto;
+ int ulen;
+
+ /* Non-first fragment has no UDP header */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ offset2 = offset + cih->ihl * 4;
+ ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr,
+ &iproto);
+ if (ulen > 0) {
+ /* Skip IP and UDP tunnel headers */
+ offset = offset2 + ulen;
+ /* Now we should be at the original IP header */
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph),
+ &_ciph);
+ if (cih && cih->version == 4 && cih->ihl >= 5 &&
+ iproto == IPPROTO_IPIP)
+ ipip = true;
+ else
+ return NF_ACCEPT;
+ }
}
pd = ip_vs_proto_data_get(ipvs, cih->protocol);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 776c87ed4813..84384d896e29 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -510,15 +510,36 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
+ __be16 port;
if (dest->in_rs_table)
return;
+ switch (IP_VS_DFWD_METHOD(dest)) {
+ case IP_VS_CONN_F_MASQ:
+ port = dest->port;
+ break;
+ case IP_VS_CONN_F_TUNNEL:
+ switch (dest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ port = dest->tun_port;
+ break;
+ case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
+ port = 0;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+
/*
* Hash by proto,addr,port,
* which are the parameters of the real service.
*/
- hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
dest->in_rs_table = 1;
@@ -550,7 +571,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
/* HIT */
return true;
}
@@ -580,7 +602,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+/* Find real service record by <af,addr,tun_port>.
+ * In case of multiple records with the same <af,addr,tun_port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
+ const union nf_inet_addr *daddr,
+ __be16 tun_port)
+{
+ struct ip_vs_dest *dest;
+ unsigned int hash;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, tun_port);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->tun_port == tun_port &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
/* HIT */
return dest;
}
@@ -826,24 +878,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
+ /* Need to rehash? */
+ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
+ IP_VS_DFWD_METHOD(dest) ||
+ udest->tun_type != dest->tun_type ||
+ udest->tun_port != dest->tun_port)
+ ip_vs_rs_unhash(dest);
+
/* set the tunnel info */
dest->tun_type = udest->tun_type;
dest->tun_port = udest->tun_port;
+ dest->tun_flags = udest->tun_flags;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
- /*
- * Put the real service in rs_table if not present.
- * For now only for NAT!
- */
- ip_vs_rs_hash(ipvs, dest);
/* FTP-NAT requires conntrack for mangling */
if (svc->port == FTPPORT)
ip_vs_register_conntrack(svc);
}
atomic_set(&dest->conn_flags, conn_flags);
+ /* Put the real service in rs_table if not present. */
+ ip_vs_rs_hash(ipvs, dest);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
@@ -2906,6 +2963,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
[IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
};
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
@@ -3212,6 +3270,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
dest->tun_type) ||
nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
dest->tun_port) ||
+ nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
+ dest->tun_flags) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3332,7 +3392,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
- *nla_l_thresh, *nla_tun_type, *nla_tun_port;
+ *nla_l_thresh, *nla_tun_type, *nla_tun_port,
+ *nla_tun_flags;
nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
@@ -3340,6 +3401,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
+ nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS];
if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
@@ -3355,6 +3417,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
if (nla_tun_port)
udest->tun_port = nla_get_be16(nla_tun_port);
+
+ if (nla_tun_flags)
+ udest->tun_flags = nla_get_u16(nla_tun_flags);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index c244b2545e24..cf925906f59b 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -267,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
if (cp->app_data == (void *) IP_VS_FTP_PASV) {
@@ -433,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index b58ddb7dffd1..a0921adc31a9 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 915ac8206076..000d961b97e4 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -159,7 +159,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -237,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 379140075e95..153d89647c87 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -148,7 +148,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -231,7 +231,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index e101eda05d55..71fc6d63a67f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -36,6 +36,7 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
@@ -275,7 +276,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return false;
ipv6_hdr(skb)->hop_limit--;
@@ -290,7 +291,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
/* Decrease ttl */
@@ -381,8 +382,13 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (!dest)
goto err_put;
- if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
if (mtu < 68) {
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto err_put;
@@ -536,8 +542,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
if (!dest)
goto err_put;
- if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
if (mtu < IPV6_MIN_MTU) {
IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
IPV6_MIN_MTU);
@@ -792,7 +803,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -881,7 +892,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1002,17 +1013,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb,
__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
struct udphdr *udph; /* Our new UDP header */
struct guehdr *gueh; /* Our new GUE header */
+ size_t hdrlen, optlen = 0;
+ void *data;
+ bool need_priv = false;
+
+ if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ need_priv = true;
+ }
- skb_push(skb, sizeof(struct guehdr));
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
gueh = (struct guehdr *)skb->data;
gueh->control = 0;
gueh->version = 0;
- gueh->hlen = 0;
+ gueh->hlen = optlen >> 2;
gueh->flags = 0;
gueh->proto_ctype = *next_protocol;
+ data = &gueh[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd;
+
+ gueh->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd = data;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
skb_push(skb, sizeof(struct udphdr));
skb_reset_transport_header(skb);
@@ -1066,6 +1116,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
int tun_type, gso_type;
+ int tun_flags;
EnterFunction(10);
@@ -1088,9 +1139,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ }
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
@@ -1101,8 +1162,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
gso_type = __tun_gso_type_mask(AF_INET, cp->af);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- gso_type |= SKB_GSO_UDP_TUNNEL;
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ }
if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
@@ -1111,8 +1181,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_set_inner_ipproto(skb, next_protocol);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- ipvs_gue_encap(net, skb, cp, &next_protocol);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
+ }
+
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -1170,6 +1251,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
int tun_type, gso_type;
+ int tun_flags;
EnterFunction(10);
@@ -1193,9 +1275,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ }
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, &payload_len,
@@ -1204,8 +1296,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- gso_type |= SKB_GSO_UDP_TUNNEL;
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ }
if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
@@ -1214,8 +1315,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_set_inner_ipproto(skb, next_protocol);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- ipvs_gue_encap(net, skb, cp, &next_protocol);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
+ }
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
@@ -1400,7 +1511,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1489,7 +1600,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index e52fcb9c9a96..921a7b95be68 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -37,12 +37,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
- } endfor_ifa(in_dev);
+ }
}
if (mask == 0)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f4f9b8344a32..bdfeacee0817 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -749,9 +749,6 @@ begin:
continue;
}
- if (nf_ct_is_dying(ct))
- continue;
-
if (nf_ct_key_equal(h, tuple, zone, net))
return h;
}
@@ -777,20 +774,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conn *ct;
rcu_read_lock();
-begin:
+
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
+ /* We have a candidate that matches the tuple we're interested
+ * in, try to obtain a reference and re-check tuple
+ */
ct = nf_ct_tuplehash_to_ctrack(h);
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
- h = NULL;
- else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
- nf_ct_put(ct);
- goto begin;
- }
+ if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+ if (likely(nf_ct_key_equal(h, tuple, zone, net)))
+ goto found;
+
+ /* TYPESAFE_BY_RCU recycled the candidate */
+ nf_ct_put(ct);
}
+
+ h = NULL;
}
+found:
rcu_read_unlock();
return h;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index fac6986d37a8..6497e5fc0871 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -6,7 +6,7 @@
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*
* For more information, please see http://nath323.sourceforge.net/
*/
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 37bb530d848f..a0560d175a7f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -16,6 +16,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
#include <net/netfilter/nf_log.h>
#include <linux/ip.h>
@@ -120,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
};
EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
-static unsigned int nf_confirm(struct sk_buff *skb,
- unsigned int protoff,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
+unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
const struct nf_conn_help *help;
@@ -154,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb,
/* We've seen it coming out the other side: confirm it */
return nf_conntrack_confirm(skb);
}
+EXPORT_SYMBOL_GPL(nf_confirm);
static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
@@ -442,12 +442,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
return 0;
}
+static struct nf_ct_bridge_info *nf_ct_bridge_info;
+
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
- bool fixup_needed = false;
+ bool fixup_needed = false, retry = true;
int err = 0;
-
+retry:
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
@@ -487,6 +489,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
fixup_needed = true;
break;
#endif
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info) {
+ if (!retry) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ mutex_unlock(&nf_ct_proto_mutex);
+ request_module("nf_conntrack_bridge");
+ retry = false;
+ goto retry;
+ }
+ if (!try_module_get(nf_ct_bridge_info->me)) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ cnet->users_bridge++;
+ if (cnet->users_bridge > 1)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+ if (err)
+ cnet->users_bridge = 0;
+ else
+ fixup_needed = true;
+ break;
default:
err = -EPROTO;
break;
@@ -519,47 +547,99 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
ARRAY_SIZE(ipv6_conntrack_ops));
break;
#endif
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info)
+ break;
+ if (cnet->users_bridge && (--cnet->users_bridge == 0))
+ nf_unregister_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+
+ module_put(nf_ct_bridge_info->me);
+ break;
}
-
mutex_unlock(&nf_ct_proto_mutex);
}
-int nf_ct_netns_get(struct net *net, u8 nfproto)
+static int nf_ct_netns_inet_get(struct net *net)
{
int err;
- if (nfproto == NFPROTO_INET) {
- err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_do_get(net, nfproto);
- if (err < 0)
- goto err1;
- }
- return 0;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ return err;
err2:
nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
return err;
}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ switch (nfproto) {
+ case NFPROTO_INET:
+ err = nf_ct_netns_inet_get(net);
+ break;
+ case NFPROTO_BRIDGE:
+ err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE);
+ if (err < 0)
+ return err;
+
+ err = nf_ct_netns_inet_get(net);
+ if (err < 0) {
+ nf_ct_netns_put(net, NFPROTO_BRIDGE);
+ return err;
+ }
+ break;
+ default:
+ err = nf_ct_netns_do_get(net, nfproto);
+ break;
+ }
+ return err;
+}
EXPORT_SYMBOL_GPL(nf_ct_netns_get);
void nf_ct_netns_put(struct net *net, uint8_t nfproto)
{
- if (nfproto == NFPROTO_INET) {
+ switch (nfproto) {
+ case NFPROTO_BRIDGE:
+ nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
+ /* fall through */
+ case NFPROTO_INET:
nf_ct_netns_do_put(net, NFPROTO_IPV4);
nf_ct_netns_do_put(net, NFPROTO_IPV6);
- } else {
+ break;
+ default:
nf_ct_netns_do_put(net, nfproto);
+ break;
}
}
EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+void nf_ct_bridge_register(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = info;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_register);
+
+void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(!nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = NULL;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister);
+
int nf_conntrack_proto_init(void)
{
int ret;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 522c08c23600..fce3d93f1541 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -336,7 +336,7 @@ static bool sctp_error(struct sk_buff *skb,
if (state->hook == NF_INET_PRE_ROUTING &&
state->net->ct.sysctl_checksum &&
skb->ip_summed == CHECKSUM_NONE) {
- if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
+ if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) {
logmsg = "nf_ct_sctp: failed to read header ";
goto out_invalid;
}
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 1e2cc83ff5da..d5fdfa00d683 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*/
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dc21a43cd145..3066449f8bd8 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
tcph = (void *)skb->data + protoff;
@@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
- if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, protoff + sizeof(*tcph)))
return 0;
tcph = (void *)skb->data + protoff;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 948b4ebbe3fb..e3d797252a98 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
ft->dst_port = ctt->dst.u.tcp.port;
ft->iifidx = other_dst->dev->ifindex;
- ft->oifidx = dst->dev->ifindex;
ft->dst_cache = dst;
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 98bf543e9891..a263505455fc 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -95,7 +95,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
struct tcphdr *tcph;
int oldlen, datalen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
@@ -145,7 +145,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
struct udphdr *udph;
int datalen, oldlen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 07da07788f6b..888292e8fbb2 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -70,7 +70,7 @@ static bool udp_manip_pkt(struct sk_buff *skb,
struct udphdr *hdr;
bool do_csum;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
@@ -88,7 +88,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
struct udphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
@@ -114,7 +114,7 @@ sctp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(*hdr))
hdrsize = sizeof(*hdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -155,7 +155,7 @@ tcp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct tcphdr *)(skb->data + hdroff);
@@ -195,7 +195,7 @@ dccp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(struct dccp_hdr))
hdrsize = sizeof(struct dccp_hdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct dccp_hdr *)(skb->data + hdroff);
@@ -229,7 +229,7 @@ icmp_manip_pkt(struct sk_buff *skb,
{
struct icmphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmphdr *)(skb->data + hdroff);
@@ -247,7 +247,7 @@ icmpv6_manip_pkt(struct sk_buff *skb,
{
struct icmp6hdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmp6hdr *)(skb->data + hdroff);
@@ -275,7 +275,7 @@ gre_manip_pkt(struct sk_buff *skb,
/* pgreh includes two optional 32bit fields which are not required
* to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8))
return false;
greh = (void *)skb->data + hdroff;
@@ -347,7 +347,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
struct iphdr *iph;
unsigned int hdroff;
- if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*iph)))
return false;
iph = (void *)skb->data + iphdroff;
@@ -378,7 +378,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
int hdroff;
u8 nexthdr;
- if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h)))
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h)))
return false;
ipv6h = (void *)skb->data + iphdroff;
@@ -562,7 +562,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
return 0;
if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
return 0;
@@ -784,7 +784,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
return 0;
if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
return 0;
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 4ffe5e5e65ba..f91579c821e9 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -44,15 +44,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
if (hooknum == NF_INET_LOCAL_OUT) {
newdst = htonl(0x7F000001);
} else {
- struct in_device *indev;
- struct in_ifaddr *ifa;
+ const struct in_device *indev;
newdst = 0;
indev = __in_dev_get_rcu(skb->dev);
- if (indev && indev->ifa_list) {
- ifa = indev->ifa_list;
- newdst = ifa->ifa_local;
+ if (indev) {
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(indev->ifa_list);
+ if (ifa)
+ newdst = ifa->ifa_local;
}
if (!newdst)
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 7de28fa0f14a..e338d91980d8 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -282,7 +282,7 @@ next:
if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
struct udphdr *uh;
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8ce74ed985c0..409722d23302 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -10,16 +10,16 @@
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_tcpudp.h>
-#include <linux/netfilter/xt_SYNPROXY.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_SYNPROXY.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_synproxy.h>
unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -57,7 +57,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS) {
opts->mss = get_unaligned_be16(ptr);
- opts->options |= XT_SYNPROXY_OPT_MSS;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
}
break;
case TCPOPT_WINDOW:
@@ -65,19 +65,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
opts->wscale = *ptr;
if (opts->wscale > TCP_MAX_WSCALE)
opts->wscale = TCP_MAX_WSCALE;
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
}
break;
case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP) {
opts->tsval = get_unaligned_be32(ptr);
opts->tsecr = get_unaligned_be32(ptr + 4);
- opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ opts->options |= NF_SYNPROXY_OPT_TIMESTAMP;
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM)
- opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ opts->options |= NF_SYNPROXY_OPT_SACK_PERM;
break;
}
@@ -89,36 +89,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
-unsigned int synproxy_options_size(const struct synproxy_options *opts)
+static unsigned int
+synproxy_options_size(const struct synproxy_options *opts)
{
unsigned int size = 0;
- if (opts->options & XT_SYNPROXY_OPT_MSS)
+ if (opts->options & NF_SYNPROXY_OPT_MSS)
size += TCPOLEN_MSS_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
size += TCPOLEN_TSTAMP_ALIGNED;
- else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
size += TCPOLEN_SACKPERM_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE)
size += TCPOLEN_WSCALE_ALIGNED;
return size;
}
-EXPORT_SYMBOL_GPL(synproxy_options_size);
-void
+static void
synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
{
__be32 *ptr = (__be32 *)(th + 1);
u8 options = opts->options;
- if (options & XT_SYNPROXY_OPT_MSS)
+ if (options & NF_SYNPROXY_OPT_MSS)
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
opts->mss);
- if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
- if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
@@ -131,58 +131,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
*ptr++ = htonl(opts->tsval);
*ptr++ = htonl(opts->tsecr);
- } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ } else if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
- if (options & XT_SYNPROXY_OPT_WSCALE)
+ if (options & NF_SYNPROXY_OPT_WSCALE)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->wscale);
}
-EXPORT_SYMBOL_GPL(synproxy_build_options);
-void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
opts->tsval = tcp_time_stamp_raw() & ~0x3f;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
opts->wscale = info->wscale;
} else
opts->tsval |= 0xf;
- if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
opts->tsval |= 1 << 4;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
opts->tsval |= 1 << 5;
}
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
-void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+static void
+synproxy_check_timestamp_cookie(struct synproxy_options *opts)
{
opts->wscale = opts->tsecr & 0xf;
if (opts->wscale != 0xf)
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
- opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+ opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0;
- opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+ opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0;
}
-EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
-unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
- unsigned int protoff,
- struct tcphdr *th,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- const struct nf_conn_synproxy *synproxy)
+static unsigned int
+synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
+ struct tcphdr *th, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
__be32 *ptr, old;
@@ -193,7 +191,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + th->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
while (optoff < optend) {
@@ -232,7 +230,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
}
return 1;
}
-EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
.len = sizeof(struct nf_conn_synproxy),
@@ -413,5 +410,830 @@ static void __exit synproxy_core_exit(void)
module_init(synproxy_core_init);
module_exit(synproxy_core_exit);
+static struct iphdr *
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = net->ipv4.sysctl_ip_default_ttl;
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack);
+
+static void
+synproxy_send_server_syn(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(struct net *net,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack);
+
+unsigned int
+ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb) ||
+ ip_hdr(skb)->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv4_synproxy_hook);
+
+static const struct nf_hook_ops ipv4_synproxy_ops[] = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref4 == 0) {
+ err = nf_register_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err)
+ return err;
+ }
+
+ snet->hook_ref4++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
+
+void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref4--;
+ if (snet->hook_ref4 == 0)
+ nf_unregister_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct ipv6hdr *
+synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ struct ipv6hdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ ip6_flow_hdr(iph, 0, 0);
+ iph->hop_limit = net->ipv6.devconf_all->hop_limit;
+ iph->nexthdr = IPPROTO_TCP;
+ iph->saddr = *saddr;
+ iph->daddr = *daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp_ipv6(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct ipv6hdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = niph->saddr;
+ fl6.daddr = niph->daddr;
+ fl6.fl6_sport = nth->source;
+ fl6.fl6_dport = nth->dest;
+ security_skb_classify_flow((struct sk_buff *)skb,
+ flowi6_to_flowi(&fl6));
+ err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+ if (err) {
+ goto free_nskb;
+ }
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ goto free_nskb;
+
+ skb_dst_set(nskb, dst);
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip6_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6);
+
+static void
+synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general,
+ IP_CT_NEW, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth,
+ tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6);
+
+unsigned int
+ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ __be16 frag_off;
+ u8 nexthdr;
+ int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (thoff < 0 || nexthdr != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack_ipv6(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack_ipv6(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv6_synproxy_hook);
+
+static const struct nf_hook_ops ipv6_synproxy_ops[] = {
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int
+nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref6 == 0) {
+ err = nf_register_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+ if (err)
+ return err;
+ }
+
+ snet->hook_ref6++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
+
+void
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref6--;
+ if (snet->hook_ref6 == 0)
+ nf_unregister_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
+#endif /* CONFIG_IPV6 */
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bcf17fb46d96..cae5c46e2dd4 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1446,25 +1446,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct net *net,
- struct nft_base_chain *chain,
- struct nft_stats __percpu *newstats)
+static void nft_chain_stats_replace(struct nft_trans *trans)
{
- struct nft_stats __percpu *oldstats;
+ struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain);
- if (newstats == NULL)
+ if (!nft_trans_chain_stats(trans))
return;
- if (rcu_access_pointer(chain->stats)) {
- oldstats = rcu_dereference_protected(chain->stats,
- lockdep_commit_lock_is_held(net));
- rcu_assign_pointer(chain->stats, newstats);
- synchronize_rcu();
- free_percpu(oldstats);
- } else {
- rcu_assign_pointer(chain->stats, newstats);
+ rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans),
+ lockdep_commit_lock_is_held(trans->ctx.net));
+
+ if (!nft_trans_chain_stats(trans))
static_branch_inc(&nft_counters_enabled);
- }
}
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
@@ -3877,6 +3870,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
[NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 },
[NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED },
@@ -4330,7 +4324,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *data,
- u64 timeout, gfp_t gfp)
+ u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
@@ -4345,9 +4339,11 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_key(ext), key, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
memcpy(nft_set_ext_data(ext), data, set->dlen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
- *nft_set_ext_expiration(ext) =
- get_jiffies_64() + timeout;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
+ *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+ if (expiration == 0)
+ *nft_set_ext_expiration(ext) += timeout;
+ }
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
@@ -4412,6 +4408,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_trans *trans;
u32 flags = 0;
u64 timeout;
+ u64 expiration;
u8 ulen;
int err;
@@ -4455,6 +4452,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
timeout = set->timeout;
}
+ expiration = 0;
+ if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
+ &expiration);
+ if (err)
+ return err;
+ }
+
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
@@ -4537,7 +4544,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
- timeout, GFP_KERNEL);
+ timeout, expiration, GFP_KERNEL);
if (elem.priv == NULL)
goto err3;
@@ -4739,7 +4746,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
- GFP_KERNEL);
+ 0, GFP_KERNEL);
if (elem.priv == NULL)
goto err2;
@@ -6359,9 +6366,9 @@ static void nft_chain_commit_update(struct nft_trans *trans)
if (!nft_is_base_chain(trans->ctx.chain))
return;
+ nft_chain_stats_replace(trans);
+
basechain = nft_base_chain(trans->ctx.chain);
- nft_chain_stats_replace(trans->ctx.net, basechain,
- nft_trans_chain_stats(trans));
switch (nft_trans_chain_policy(trans)) {
case NF_DROP:
@@ -6378,6 +6385,7 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_table_destroy(&trans->ctx);
break;
case NFT_MSG_NEWCHAIN:
+ free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
break;
case NFT_MSG_DELCHAIN:
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f42326b40d6f..9f5dea0064ea 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
{
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+ const struct in_ifaddr *ifa;
int ret = 0;
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
- endfor_ifa(in_dev);
-
return ret;
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 89750f74e3a2..b6a7ce622c72 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -859,7 +859,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
}
skb_put(e->skb, diff);
}
- if (!skb_make_writable(e->skb, data_len))
+ if (skb_ensure_writable(e->skb, data_len))
return -ENOMEM;
skb_copy_to_linear_data(e->skb, data, data_len);
e->skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index dfcdea6619f1..827ab6196df9 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -21,6 +21,7 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
struct nft_ct {
enum nft_ct_keys key:8;
@@ -1153,6 +1154,135 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
+struct nft_ct_expect_obj {
+ u16 l3num;
+ __be16 dport;
+ u8 l4proto;
+ u8 size;
+ u32 timeout;
+};
+
+static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (!tb[NFTA_CT_EXPECT_L4PROTO] ||
+ !tb[NFTA_CT_EXPECT_DPORT] ||
+ !tb[NFTA_CT_EXPECT_TIMEOUT] ||
+ !tb[NFTA_CT_EXPECT_SIZE])
+ return -EINVAL;
+
+ priv->l3num = ctx->family;
+ if (tb[NFTA_CT_EXPECT_L3PROTO])
+ priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+
+ priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
+ priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
+ priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
+
+ return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_ct_expect_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) ||
+ nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) ||
+ nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size))
+ return -1;
+
+ return 0;
+}
+
+static void nft_ct_expect_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+ struct nf_conntrack_expect *exp;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_help *help;
+ enum ip_conntrack_dir dir;
+ u16 l3num = priv->l3num;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_UNTRACKED) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ dir = CTINFO2DIR(ctinfo);
+
+ help = nfct_help(ct);
+ if (!help)
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+ if (!help) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ if (l3num == NFPROTO_INET)
+ l3num = nf_ct_l3num(ct);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ priv->l4proto, NULL, &priv->dport);
+ exp->timeout.expires = jiffies + priv->timeout * HZ;
+
+ if (nf_ct_expect_related(exp) != 0)
+ regs->verdict.code = NF_DROP;
+}
+
+static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = {
+ [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 },
+ [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 },
+};
+
+static struct nft_object_type nft_ct_expect_obj_type;
+
+static const struct nft_object_ops nft_ct_expect_obj_ops = {
+ .type = &nft_ct_expect_obj_type,
+ .size = sizeof(struct nft_ct_expect_obj),
+ .eval = nft_ct_expect_obj_eval,
+ .init = nft_ct_expect_obj_init,
+ .destroy = nft_ct_expect_obj_destroy,
+ .dump = nft_ct_expect_obj_dump,
+};
+
+static struct nft_object_type nft_ct_expect_obj_type __read_mostly = {
+ .type = NFT_OBJECT_CT_EXPECT,
+ .ops = &nft_ct_expect_obj_ops,
+ .maxattr = NFTA_CT_EXPECT_MAX,
+ .policy = nft_ct_expect_policy,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_ct_module_init(void)
{
int err;
@@ -1170,17 +1300,23 @@ static int __init nft_ct_module_init(void)
err = nft_register_obj(&nft_ct_helper_obj_type);
if (err < 0)
goto err2;
+
+ err = nft_register_obj(&nft_ct_expect_obj_type);
+ if (err < 0)
+ goto err3;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
err = nft_register_obj(&nft_ct_timeout_obj_type);
if (err < 0)
- goto err3;
+ goto err4;
#endif
return 0;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err4:
+ nft_unregister_obj(&nft_ct_expect_obj_type);
+#endif
err3:
nft_unregister_obj(&nft_ct_helper_obj_type);
-#endif
err2:
nft_unregister_expr(&nft_notrack_type);
err1:
@@ -1193,6 +1329,7 @@ static void __exit nft_ct_module_exit(void)
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
nft_unregister_obj(&nft_ct_timeout_obj_type);
#endif
+ nft_unregister_obj(&nft_ct_expect_obj_type);
nft_unregister_obj(&nft_ct_helper_obj_type);
nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
@@ -1207,3 +1344,4 @@ MODULE_ALIAS_NFT_EXPR("ct");
MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 505bdfc66801..33833a0cb989 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -56,7 +56,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
elem = nft_set_elem_init(set, &priv->tmpl,
&regs->data[priv->sreg_key],
&regs->data[priv->sreg_data],
- timeout, GFP_ATOMIC);
+ timeout, 0, GFP_ATOMIC);
if (elem == NULL)
goto err1;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a7aa6c5250a4..a5e8469859e3 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,6 +59,103 @@ err:
regs->verdict.code = NFT_BREAK;
}
+/* find the offset to specified option.
+ *
+ * If target header is found, its offset is set in *offset and return option
+ * number. Otherwise, return negative error.
+ *
+ * If the first fragment doesn't contain the End of Options it is considered
+ * invalid.
+ */
+static int ipv4_find_option(struct net *net, struct sk_buff *skb,
+ unsigned int *offset, int target)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct iphdr *iph, _iph;
+ unsigned int start;
+ bool found = false;
+ __be32 info;
+ int optlen;
+
+ iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!iph)
+ return -EBADMSG;
+ start = sizeof(struct iphdr);
+
+ optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
+ if (optlen <= 0)
+ return -ENOENT;
+
+ memset(opt, 0, sizeof(struct ip_options));
+ /* Copy the options since __ip_options_compile() modifies
+ * the options.
+ */
+ if (skb_copy_bits(skb, start, opt->__data, optlen))
+ return -EBADMSG;
+ opt->optlen = optlen;
+
+ if (__ip_options_compile(net, opt, NULL, &info))
+ return -EBADMSG;
+
+ switch (target) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (!opt->srr)
+ break;
+ found = target == IPOPT_SSRR ? opt->is_strictroute :
+ !opt->is_strictroute;
+ if (found)
+ *offset = opt->srr + start;
+ break;
+ case IPOPT_RR:
+ if (!opt->rr)
+ break;
+ *offset = opt->rr + start;
+ found = true;
+ break;
+ case IPOPT_RA:
+ if (!opt->router_alert)
+ break;
+ *offset = opt->router_alert + start;
+ found = true;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return found ? target : -ENOENT;
+}
+
+static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct sk_buff *skb = pkt->skb;
+ unsigned int offset;
+ int err;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto err;
+
+ err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ *dest = (err >= 0);
+ return;
+ } else if (err < 0) {
+ goto err;
+ }
+ offset += priv->offset;
+
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ goto err;
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
static void *
nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
unsigned int len, void *buffer, unsigned int *tcphdr_len)
@@ -153,7 +250,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
return;
- if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len))
+ if (skb_ensure_writable(pkt->skb,
+ pkt->xt.thoff + i + priv->len))
return;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
@@ -311,6 +409,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
return nft_validate_register_load(priv->sreg, priv->len);
}
+static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ switch (priv->type) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ case IPOPT_RR:
+ case IPOPT_RA:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
{
if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -357,6 +477,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.dump = nft_exthdr_dump,
};
+static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_ipv4_eval,
+ .init = nft_exthdr_ipv4_init,
+ .dump = nft_exthdr_dump,
+};
+
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -397,6 +525,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
break;
+ case NFT_EXTHDR_OP_IPV4:
+ if (ctx->family != NFPROTO_IPV6) {
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_ipv4_ops;
+ }
+ break;
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 680bd9f38a81..1260f78a034d 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -240,7 +240,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
tsum));
}
- if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) ||
skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
return -1;
@@ -256,7 +256,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return -1;
nft_csum_replace(&sum, fsum, tsum);
- if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) ||
skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
return -1;
@@ -309,7 +309,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
goto err;
}
- if (!skb_make_writable(skb, max(offset + priv->len, 0)) ||
+ if (skb_ensure_writable(skb, max(offset + priv->len, 0)) ||
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index b1054a3d18c5..eababc354ff1 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -31,7 +31,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
ipv4_change_dsfield(ip_hdr(skb),
@@ -49,7 +49,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
ipv6_change_dsfield(ipv6_hdr(skb),
@@ -79,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ip_hdr(skb);
ipv4_change_dsfield(iph, 0, nv);
@@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ipv6_hdr(skb);
ipv6_change_dsfield(iph, 0, nv);
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c
index 8221a5ce44bf..7873b834c300 100644
--- a/net/netfilter/xt_HL.c
+++ b/net/netfilter/xt_HL.c
@@ -29,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct ipt_TTL_info *info = par->targinfo;
int new_ttl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*iph)))
return NF_DROP;
iph = ip_hdr(skb);
@@ -69,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)
const struct ip6t_HL_info *info = par->targinfo;
int new_hl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*ip6h)))
return NF_DROP;
ip6h = ipv6_hdr(skb);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 0b3a1b291c91..122db9fbb9f4 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (par->fragoff != 0)
return 0;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return -1;
len = skb->len - tcphoff;
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 666f4ca9b15f..30e99464171b 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -28,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
static unsigned int
tcpoptstrip_mangle_packet(struct sk_buff *skb,
const struct xt_action_param *par,
- unsigned int tcphoff, unsigned int minlen)
+ unsigned int tcphoff)
{
const struct xt_tcpoptstrip_target_info *info = par->targinfo;
+ struct tcphdr *tcph, _th;
unsigned int optl, i, j;
- struct tcphdr *tcph;
u_int16_t n, o;
u_int8_t *opt;
- int len, tcp_hdrlen;
+ int tcp_hdrlen;
/* This is a fragment, no TCP header is available */
if (par->fragoff != 0)
return XT_CONTINUE;
- if (!skb_make_writable(skb, skb->len))
+ tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th);
+ if (!tcph)
return NF_DROP;
- len = skb->len - tcphoff;
- if (len < (int)sizeof(struct tcphdr))
- return NF_DROP;
-
- tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
tcp_hdrlen = tcph->doff * 4;
+ if (tcp_hdrlen < sizeof(struct tcphdr))
+ return NF_DROP;
- if (len < tcp_hdrlen)
+ if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen))
return NF_DROP;
- opt = (u_int8_t *)tcph;
+ /* must reload tcph, might have been moved */
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ opt = (u8 *)tcph;
/*
* Walk through all TCP options - if we find some option to remove,
@@ -88,8 +88,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
static unsigned int
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
- sizeof(struct iphdr) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
@@ -106,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (tcphoff < 0)
return NF_DROP;
- return tcpoptstrip_mangle_packet(skb, par, tcphoff,
- sizeof(*ipv6h) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff);
}
#endif
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 140ce6be639a..0c9e014e30b4 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -2,7 +2,7 @@
/*
* xt_iprange - Netfilter module to match IP address ranges
*
- * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) CC Computer Consultants GmbH, 2008
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -130,7 +130,7 @@ static void __exit iprange_mt_exit(void)
module_init(iprange_mt_init);
module_exit(iprange_mt_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
MODULE_ALIAS("ipt_iprange");
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 95f64a99e425..e85ce69924ae 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -22,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par)
struct xt_owner_match_info *info = par->matchinfo;
struct net *net = par->net;
+ if (info->match & ~XT_OWNER_MASK)
+ return -EINVAL;
+
/* Only allow the common case where the userns of the writer
* matches the userns of the network namespace.
*/
@@ -88,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
if (info->match & XT_OWNER_GID) {
+ unsigned int i, match = false;
kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
- if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
- gid_lte(filp->f_cred->fsgid, gid_max)) ^
- !(info->invert & XT_OWNER_GID))
+ struct group_info *gi = filp->f_cred->group_info;
+
+ if (gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max))
+ match = true;
+
+ if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) {
+ for (i = 0; i < gi->ngroups; ++i) {
+ kgid_t group = gi->gid[i];
+
+ if (gid_gte(group, gid_min) &&
+ gid_lte(group, gid_max)) {
+ match = true;
+ break;
+ }
+ }
+ }
+
+ if (match ^ !(info->invert & XT_OWNER_GID))
return false;
}
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index f099228cb9c4..ecbfa291fb70 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -2,7 +2,7 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module which implements the set match and SET target
@@ -18,7 +18,7 @@
#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IP set match and target module");
MODULE_ALIAS("xt_SET");
MODULE_ALIAS("ipt_set");
@@ -436,6 +436,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
ip_set_id_t index;
+ int ret = 0;
if (info->add_set.index != IPSET_INVALID_ID) {
index = ip_set_nfnl_get_byindex(par->net,
@@ -453,17 +454,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find del_set index %u as target\n",
info->del_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_add;
}
}
if (info->map_set.index != IPSET_INVALID_ID) {
if (strncmp(par->table, "mangle", 7)) {
pr_info_ratelimited("--map-set only usable from mangle table\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
(info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
@@ -471,20 +471,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
1 << NF_INET_LOCAL_OUT |
1 << NF_INET_POST_ROUTING))) {
pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find map_set index %u as target\n",
info->map_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->del_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_del;
}
}
@@ -492,16 +488,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
info->del_set.dim > IPSET_DIM_MAX ||
info->map_set.dim > IPSET_DIM_MAX) {
pr_info_ratelimited("SET target dimension over the limit!\n");
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->del_set.index);
- if (info->map_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->map_set.index);
- return -ERANGE;
+ ret = -ERANGE;
+ goto cleanup_mark;
}
return 0;
+cleanup_mark:
+ if (info->map_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->map_set.index);
+cleanup_del:
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+cleanup_add:
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return ret;
}
static void
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e9ddfd782d16..90b2ab9dd449 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -241,13 +241,8 @@ static __net_init int netlink_tap_init_net(struct net *net)
return 0;
}
-static void __net_exit netlink_tap_exit_net(struct net *net)
-{
-}
-
static struct pernet_operations netlink_tap_net_ops = {
.init = netlink_tap_init_net,
- .exit = netlink_tap_exit_net,
.id = &netlink_tap_net_id,
.size = sizeof(struct netlink_tap_net),
};
@@ -2544,12 +2539,10 @@ struct nl_seq_iter {
int link;
};
-static int netlink_walk_start(struct nl_seq_iter *iter)
+static void netlink_walk_start(struct nl_seq_iter *iter)
{
rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti);
rhashtable_walk_start(&iter->hti);
-
- return 0;
}
static void netlink_walk_stop(struct nl_seq_iter *iter)
@@ -2565,8 +2558,6 @@ static void *__netlink_seq_next(struct seq_file *seq)
do {
for (;;) {
- int err;
-
nlk = rhashtable_walk_next(&iter->hti);
if (IS_ERR(nlk)) {
@@ -2583,9 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq)
if (++iter->link >= MAX_LINKS)
return NULL;
- err = netlink_walk_start(iter);
- if (err)
- return ERR_PTR(err);
+ netlink_walk_start(iter);
}
} while (sock_net(&nlk->sk) != seq_file_net(seq));
@@ -2597,13 +2586,10 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
struct nl_seq_iter *iter = seq->private;
void *obj = SEQ_START_TOKEN;
loff_t pos;
- int err;
iter->link = 0;
- err = netlink_walk_start(iter);
- if (err)
- return ERR_PTR(err);
+ netlink_walk_start(iter);
for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
obj = __netlink_seq_next(seq);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6747bc57b6fa..33b388103741 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1334,7 +1334,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
&flow->id, info, false, ufid_flags);
if (likely(reply)) {
- if (likely(!IS_ERR(reply))) {
+ if (!IS_ERR(reply)) {
rcu_read_lock(); /*To keep RCU checker happy. */
err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
reply, info->snd_portid,
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index f927de9bda0a..3fc38d16c456 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -248,8 +248,6 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
*/
void ovs_vport_del(struct vport *vport)
{
- ASSERT_OVSL();
-
hlist_del_rcu(&vport->hash_node);
module_put(vport->ops->owner);
vport->ops->destroy(vport);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5f78df080573..8d54f3047768 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
smp_wmb();
}
-static int __packet_get_status(struct packet_sock *po, void *frame)
+static int __packet_get_status(const struct packet_sock *po, void *frame)
{
union tpacket_uhdr h;
@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
return ts_status;
}
-static void *packet_lookup_frame(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int position,
- int status)
+static void *packet_lookup_frame(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int position,
+ int status)
{
unsigned int pg_vec_pos, frame_offset;
union tpacket_uhdr h;
@@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
struct sock *sk = &po->sk;
- if (po->stats.stats3.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -1003,7 +1003,6 @@ static void prb_fill_curr_block(char *curr,
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
- int status,
unsigned int len
)
{
@@ -1075,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po,
po->rx_ring.head, status);
return curr;
case TPACKET_V3:
- return __packet_lookup_frame_in_block(po, skb, status, len);
+ return __packet_lookup_frame_in_block(po, skb, len);
default:
WARN(1, "TPACKET version not supported\n");
BUG();
@@ -1083,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
}
}
-static void *prb_lookup_block(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int idx,
- int status)
+static void *prb_lookup_block(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int idx,
+ int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
@@ -1199,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po)
#define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.frame_max + 1;
- idx = po->rx_ring.head;
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
+ idx = READ_ONCE(po->rx_ring.head);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1212,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.prb_bdqc.knum_blocks;
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1225,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(const struct packet_sock *po,
+ const struct sk_buff *skb)
{
- struct sock *sk = &po->sk;
+ const struct sock *sk = &po->sk;
int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) {
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- - (skb ? skb->truesize : 0);
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL;
else if (avail > 0)
return ROOM_LOW;
@@ -1258,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
- int ret;
- bool has_room;
+ int pressure, ret;
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb);
- has_room = ret == ROOM_NORMAL;
- if (po->pressure == has_room)
- po->pressure = !has_room;
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+ pressure = ret != ROOM_NORMAL;
+
+ if (READ_ONCE(po->pressure) != pressure)
+ WRITE_ONCE(po->pressure, pressure);
return ret;
}
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
+{
+ if (READ_ONCE(po->pressure) &&
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ WRITE_ONCE(po->pressure, 0);
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1351,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1);
do {
po_next = pkt_sk(f->arr[i]);
- if (po_next != po_skip && !po_next->pressure &&
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
@@ -2126,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
is_drop_n_account = true;
- spin_lock(&sk->sk_receive_queue.lock);
- po->stats.stats1.tp_drops++;
+ atomic_inc(&po->tp_drops);
atomic_inc(&sk->sk_drops);
- spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -2193,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!res)
goto drop_n_restore;
+ /* If we are flooded, just give up */
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
+
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
@@ -2263,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level.
*/
- if (po->stats.stats1.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
}
@@ -2379,9 +2390,9 @@ drop:
return 0;
drop_n_account:
- is_drop_n_account = true;
- po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
+ atomic_inc(&po->tp_drops);
+ is_drop_n_account = true;
sk->sk_data_ready(sk);
kfree_skb(copy_skb);
@@ -3318,8 +3329,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL)
goto out;
- if (pkt_sk(sk)->pressure)
- packet_rcv_has_room(pkt_sk(sk), NULL);
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
if (pkt_sk(sk)->has_vnet_hdr) {
err = packet_rcv_vnet(msg, skb, &len);
@@ -3891,6 +3901,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
void *data = &val;
union tpacket_stats_u st;
struct tpacket_rollover_stats rstats;
+ int drops;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3907,14 +3918,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
memcpy(&st, &po->stats, sizeof(st));
memset(&po->stats, 0, sizeof(po->stats));
spin_unlock_bh(&sk->sk_receive_queue.lock);
+ drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3);
- st.stats3.tp_packets += st.stats3.tp_drops;
+ st.stats3.tp_drops = drops;
+ st.stats3.tp_packets += drops;
data = &st.stats3;
} else {
lv = sizeof(struct tpacket_stats);
- st.stats1.tp_packets += st.stats1.tp_drops;
+ st.stats1.tp_drops = drops;
+ st.stats1.tp_packets += drops;
data = &st.stats1;
}
@@ -4133,8 +4147,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= EPOLLIN | EPOLLRDNORM;
}
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
- po->pressure = 0;
+ packet_rcv_try_clear_pressure(po);
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index c70a2794456f..82fb2b10f790 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -132,6 +132,7 @@ struct packet_sock {
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp;
+ atomic_t tp_drops ____cacheline_aligned_in_smp;
};
static struct packet_sock *pkt_sk(struct sock *sk)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b8d581b779b2..ec05d91aa9a2 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
rds_ib_get_mr_info(rds_ibdev, iinfo);
+ iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
}
return 1;
}
@@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo6->max_send_sge = rds_ibdev->max_sge;
rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+ iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
}
return 1;
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2c72d95c3050..360fdd3eaa77 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -877,6 +877,23 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CTINFO
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow transfer of a connmark stored information.
+ Current actions transfer connmark stored DSCP into
+ ipv4/v6 diffserv and/or to transfer connmark to packet
+ mark. Both are useful for restoring egress based marks
+ back onto ingress connections for qdisc priority mapping
+ purposes.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ctinfo.
+
config NET_ACT_SKBMOD
tristate "skb data modification action"
depends on NET_CLS_ACT
@@ -924,14 +941,6 @@ config NET_IFE_SKBTCINDEX
tristate "Support to encoding decoding skb tcindex on IFE action"
depends on NET_ACT_IFE
-config NET_CLS_IND
- bool "Incoming device classification"
- depends on NET_CLS_U32 || NET_CLS_FW
- ---help---
- Say Y here to extend the u32 and fw classifier to support
- classification based on the incoming device. This option is
- likely to disappear in favour of the metadata ematch.
-
endif # NET_SCHED
config NET_SCH_FIFO
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..d54bfcbd7981 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 000000000000..10eb2bb99861
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+static unsigned int ctinfo_net_id;
+
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb, int wlen, int proto)
+{
+ u8 dscp, newdscp;
+
+ newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+ ~INET_ECN_MASK;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv4_change_dsfield(ip_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ case NFPROTO_IPV6:
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv6_change_dsfield(ipv6_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb)
+{
+ ca->stats_cpmark_set++;
+ skb->mark = ct->mark & cp->cpmarkmask;
+}
+
+static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash = NULL;
+ struct tcf_ctinfo *ca = to_ctinfo(a);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone zone;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ctinfo_params *cp;
+ struct nf_conn *ct;
+ int proto, wlen;
+ int action;
+
+ cp = rcu_dereference_bh(ca->params);
+
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+ action = READ_ONCE(ca->tcf_action);
+
+ wlen = skb_network_offset(skb);
+ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) { /* look harder, usually ingress */
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, cp->net, &tuple))
+ goto out;
+ zone.id = cp->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ }
+
+ if (cp->mode & CTINFO_MODE_DSCP)
+ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+ if (cp->mode & CTINFO_MODE_CPMARK)
+ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+ if (thash)
+ nf_ct_put(ct);
+out:
+ return action;
+}
+
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+ [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct
+ tc_ctinfo) },
+ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
+ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
+};
+
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+ struct nlattr *tb[TCA_CTINFO_MAX + 1];
+ struct tcf_ctinfo_params *cp_new;
+ struct tcf_chain *goto_ch = NULL;
+ u32 dscpmask = 0, dscpstatemask;
+ struct tc_ctinfo *actparm;
+ struct tcf_ctinfo *ci;
+ u8 dscpmaskshift;
+ int ret = 0, err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CTINFO_ACT]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing required TCA_CTINFO_ACT attribute");
+ return -EINVAL;
+ }
+ actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+ /* do some basic validation here before dynamically allocating things */
+ /* that we would otherwise have to clean up. */
+ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+ dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+ /* need contiguous 6 bit mask */
+ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+ if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_MASK],
+ "dscp mask must be 6 contiguous bits");
+ return -EINVAL;
+ }
+ dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
+ nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
+ /* mask & statemask must not overlap */
+ if (dscpmask & dscpstatemask) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ "dscp statemask must not overlap dscp mask");
+ return -EINVAL;
+ }
+ }
+
+ /* done the validation:now to the actual action allocation */
+ err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, actparm->index, est, a,
+ &act_ctinfo_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, actparm->index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind) /* don't override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ ci = to_ctinfo(*a);
+
+ cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+ if (unlikely(!cp_new)) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ cp_new->net = net;
+ cp_new->zone = tb[TCA_CTINFO_ZONE] ?
+ nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
+ if (dscpmask) {
+ cp_new->dscpmask = dscpmask;
+ cp_new->dscpmaskshift = dscpmaskshift;
+ cp_new->dscpstatemask = dscpstatemask;
+ cp_new->mode |= CTINFO_MODE_DSCP;
+ }
+
+ if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+ cp_new->cpmarkmask =
+ nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+ cp_new->mode |= CTINFO_MODE_CPMARK;
+ }
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+ rcu_swap_protected(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (cp_new)
+ kfree_rcu(cp_new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tc_ctinfo opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ctinfo_params *cp;
+ struct tcf_t t;
+
+ spin_lock_bh(&ci->tcf_lock);
+ cp = rcu_dereference_protected(ci->params,
+ lockdep_is_held(&ci->tcf_lock));
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ opt.action = ci->tcf_action;
+ if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+ goto nla_put_failure;
+
+ if (cp->mode & CTINFO_MODE_DSCP) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+ cp->dscpmask))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+ cp->dscpstatemask))
+ goto nla_put_failure;
+ }
+
+ if (cp->mode & CTINFO_MODE_CPMARK) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+ cp->cpmarkmask))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+ ci->stats_dscp_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+ ci->stats_dscp_error, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+ ci->stats_cpmark_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ci->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ctinfo_ops = {
+ .kind = "ctinfo",
+ .id = TCA_ID_CTINFO,
+ .owner = THIS_MODULE,
+ .act = tcf_ctinfo_act,
+ .dump = tcf_ctinfo_dump,
+ .init = tcf_ctinfo_init,
+ .walk = tcf_ctinfo_walker,
+ .lookup = tcf_ctinfo_search,
+ .size = sizeof(struct tcf_ctinfo),
+};
+
+static __net_init int ctinfo_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tc_action_net_init(tn, &act_ctinfo_ops);
+}
+
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ctinfo_net_id);
+}
+
+static struct pernet_operations ctinfo_net_ops = {
+ .init = ctinfo_init_net,
+ .exit_batch = ctinfo_exit_net,
+ .id = &ctinfo_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ctinfo_init_module(void)
+{
+ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+static void __exit ctinfo_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eedd5786c084..ce2e9b1c9850 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -27,7 +27,7 @@
#include <net/dst_metadata.h>
struct fl_flow_key {
- int indev_ifindex;
+ struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
@@ -284,7 +284,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
list_for_each_entry_rcu(mask, &head->masks, list) {
fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
+ skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
@@ -1021,15 +1021,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
{
__be16 ethertype;
int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
if (err < 0)
return err;
- key->indev_ifindex = err;
- mask->indev_ifindex = 0xffffffff;
+ key->meta.ingress_ifindex = err;
+ mask->meta.ingress_ifindex = 0xffffffff;
}
-#endif
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -1282,6 +1281,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_META, meta);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -2123,10 +2124,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb,
static int fl_dump_key(struct sk_buff *skb, struct net *net,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
- if (mask->indev_ifindex) {
+ if (mask->meta.ingress_ifindex) {
struct net_device *dev;
- dev = __dev_get_by_index(net, key->indev_ifindex);
+ dev = __dev_get_by_index(net, key->meta.ingress_ifindex);
if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
goto nla_put_failure;
}
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 4dab833f66cb..c9496c920d6f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -8,9 +8,6 @@
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
*/
#include <linux/module.h>
@@ -37,9 +34,7 @@ struct fw_filter {
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_work rwork;
@@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
-#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
@@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &f->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
return ret;
f->ifindex = ret;
}
-#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
@@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
fnew->id = f->id;
fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
@@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
-#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 38c0a9f0f296..a30d2f8feb32 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
unsigned int in_hw_count;
struct tc_matchall_pcnt __percpu *pf;
struct rcu_work rwork;
+ bool deleting;
};
static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -258,7 +259,11 @@ err_exts_init:
static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- return -EOPNOTSUPP;
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ head->deleting = true;
+ *last = true;
+ return 0;
}
static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
@@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
if (arg->count < arg->skip)
goto skip;
- if (!head)
+ if (!head || head->deleting)
return;
if (arg->fn(tp, head, arg) < 0)
arg->stop = 1;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c7727de5e073..be9e46c77e8b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -20,9 +20,6 @@
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
@@ -48,9 +45,7 @@ struct tc_u_knode {
u32 handle;
struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif
u8 fshift;
struct tcf_result res;
struct tc_u_hnode __rcu *ht_down;
@@ -176,12 +171,10 @@ check_terminal:
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->ifindex)) {
n = rcu_dereference_bh(n->next);
goto next_knode;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
__this_cpu_inc(n->pf->rhit);
#endif
@@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &n->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
@@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
return -EINVAL;
n->ifindex = ret;
}
-#endif
return 0;
}
@@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
new->handle = n->handle;
RCU_INIT_POINTER(new->ht_up, n->ht_up);
-#ifdef CONFIG_NET_CLS_IND
new->ifindex = n->ifindex;
-#endif
new->fshift = n->fshift;
new->res = n->res;
new->flags = n->flags;
@@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (n->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, n->ifindex);
if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
goto nla_put_failure;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
n->sel.nkeys * sizeof(u64),
@@ -1422,9 +1409,7 @@ static int __init init_u32(void)
#ifdef CONFIG_CLS_U32_PERF
pr_info(" Performance counters on\n");
#endif
-#ifdef CONFIG_NET_CLS_IND
pr_info(" input device check on\n");
-#endif
#ifdef CONFIG_NET_CLS_ACT
pr_info(" Actions configured\n");
#endif
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 0f65f617756b..599730f804d7 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -114,6 +114,7 @@ nla_put_failure:
}
static const struct Qdisc_class_ops ingress_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = ingress_find,
.walk = ingress_walk,
@@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch)
}
static const struct Qdisc_class_ops clsact_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = clsact_find,
.walk = ingress_walk,
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 2cae7440349c..74847d613835 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -94,11 +94,6 @@ static const struct net_offload sctp6_offload = {
},
};
-static const struct skb_checksum_ops crc32c_csum_ops = {
- .update = sctp_csum_update,
- .combine = sctp_csum_combine,
-};
-
int __init sctp_offload_init(void)
{
int ret;
@@ -111,7 +106,7 @@ int __init sctp_offload_init(void)
if (ret)
goto ipv4;
- crc32c_csum_stub = &crc32c_csum_ops;
+ crc32c_csum_stub = &sctp_csum_ops;
return ret;
ipv4:
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 23af232c0a25..2d47adcb4cbe 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -81,7 +81,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
return;
}
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
/* Add the address to the local list. */
addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
if (addr) {
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 7621ec2f539c..302e355f2ebc 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -123,30 +123,11 @@ struct proto smc_proto6 = {
};
EXPORT_SYMBOL_GPL(smc_proto6);
-static int smc_release(struct socket *sock)
+static int __smc_release(struct smc_sock *smc)
{
- struct sock *sk = sock->sk;
- struct smc_sock *smc;
+ struct sock *sk = &smc->sk;
int rc = 0;
- if (!sk)
- goto out;
-
- smc = smc_sk(sk);
-
- /* cleanup for a dangling non-blocking connect */
- if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
- tcp_abort(smc->clcsock->sk, ECONNABORTED);
- flush_work(&smc->connect_work);
-
- if (sk->sk_state == SMC_LISTEN)
- /* smc_close_non_accepted() is called and acquires
- * sock lock for child sockets again
- */
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- else
- lock_sock(sk);
-
if (!smc->use_fallback) {
rc = smc_close_active(smc);
sock_set_flag(sk, SOCK_DEAD);
@@ -174,6 +155,35 @@ static int smc_release(struct socket *sock)
smc_conn_free(&smc->conn);
}
+ return rc;
+}
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = 0;
+
+ if (!sk)
+ goto out;
+
+ smc = smc_sk(sk);
+
+ /* cleanup for a dangling non-blocking connect */
+ if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+ flush_work(&smc->connect_work);
+
+ if (sk->sk_state == SMC_LISTEN)
+ /* smc_close_non_accepted() is called and acquires
+ * sock lock for child sockets again
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ rc = __smc_release(smc);
+
/* detach socket */
sock_orphan(sk);
sock->sk = NULL;
@@ -964,26 +974,7 @@ void smc_close_non_accepted(struct sock *sk)
if (!sk->sk_lingertime)
/* wait for peer closing */
sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
- if (!smc->use_fallback) {
- smc_close_active(smc);
- sock_set_flag(sk, SOCK_DEAD);
- sk->sk_shutdown |= SHUTDOWN_MASK;
- }
- sk->sk_prot->unhash(sk);
- if (smc->clcsock) {
- struct socket *tcp;
-
- tcp = smc->clcsock;
- smc->clcsock = NULL;
- sock_release(tcp);
- }
- if (smc->use_fallback) {
- sock_put(sk); /* passive closing */
- sk->sk_state = SMC_CLOSED;
- } else {
- if (sk->sk_state == SMC_CLOSED)
- smc_conn_free(&smc->conn);
- }
+ __smc_release(smc);
release_sock(sk);
sock_put(sk); /* final sock_put */
}
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 745afd82f281..49bcebff6378 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
struct smc_clc_msg_proposal_prefix *prop)
{
struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return -ENODEV;
- for_ifa(in_dev) {
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (!inet_ifa_match(ipv4, ifa))
continue;
prop->prefix_len = inet_mask_len(ifa->ifa_mask);
prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
/* prop->ipv6_prefixes_cnt = 0; already done by memset before */
return 0;
- } endfor_ifa(in_dev);
+ }
return -ENOENT;
}
@@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev,
struct smc_clc_msg_proposal_prefix *prop)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return -ENODEV;
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
inet_ifa_match(prop->outgoing_subnet, ifa))
return 0;
- } endfor_ifa(in_dev);
+ }
return -ENOENT;
}
diff --git a/net/socket.c b/net/socket.c
index 38eec1583f6d..963df5dbdd54 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -429,7 +429,7 @@ static int sock_map_fd(struct socket *sock, int flags)
}
newfile = sock_alloc_file(sock, flags, NULL);
- if (likely(!IS_ERR(newfile))) {
+ if (!IS_ERR(newfile)) {
fd_install(fd, newfile);
return fd;
}
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index f64f36a83063..b3815c1e8f2e 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -157,18 +157,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
return 0;
}
- skb = alloc_skb(0, GFP_ATOMIC);
+ skb = alloc_skb_for_msg(head);
if (!skb) {
STRP_STATS_INCR(strp->stats.mem_fail);
desc->error = -ENOMEM;
return 0;
}
- skb->len = head->len;
- skb->data_len = head->len;
- skb->truesize = head->truesize;
- *_strp_msg(skb) = *_strp_msg(head);
+
strp->skb_nextp = &head->next;
- skb_shinfo(skb)->frag_list = head;
strp->skb_head = skb;
head = skb;
} else {
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 6c997d4a6218..1336f3cdad38 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
hdr = buf_msg(skb);
if (msg_user(hdr) == MSG_FRAGMENTER)
- hdr = msg_get_wrapped(hdr);
+ hdr = msg_inner_hdr(hdr);
if (msg_type(hdr) != TIPC_MCAST_MSG)
return 0;
@@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
skb = skb_peek(pkts);
hdr = buf_msg(skb);
if (msg_user(hdr) == MSG_FRAGMENTER)
- hdr = msg_get_wrapped(hdr);
+ hdr = msg_inner_hdr(hdr);
msg_set_is_rcast(hdr, method->rcast);
/* Switch method ? */
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 2050fd386642..f8bf63befe1f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -107,7 +107,6 @@ struct tipc_stats {
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
* @prev_from: sequence number of most previous retransmission request
- * @stale_cnt: counter for number of identical retransmit attempts
* @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
* @acked: # last packet acked by a certain peer. Used for broadcast.
@@ -167,7 +166,6 @@ struct tipc_link {
u16 snd_nxt;
u16 prev_from;
u16 window;
- u16 stale_cnt;
unsigned long stale_limit;
/* Reception */
@@ -249,9 +247,9 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data);
-static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
- struct tipc_gap_ack_blks *ga,
- struct sk_buff_head *xmitq);
+static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
+ struct tipc_gap_ack_blks *ga,
+ struct sk_buff_head *xmitq);
/*
* Simple non-static link routines (i.e. referenced outside this file)
@@ -734,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l)
if (msg_user(msg) == MSG_FRAGMENTER) {
if (msg_type(msg) != FIRST_FRAGMENT)
return;
- length = msg_size(msg_get_wrapped(msg));
+ length = msg_size(msg_inner_hdr(msg));
}
l->stats.msg_lengths_total += length;
l->stats.msg_length_counts++;
@@ -910,7 +908,6 @@ void tipc_link_reset(struct tipc_link *l)
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stale_cnt = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
@@ -1044,32 +1041,68 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
l->snd_nxt = seqno;
}
-static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
+/**
+ * link_retransmit_failure() - Detect repeated retransmit failures
+ * @l: tipc link sender
+ * @r: tipc link receiver (= l in case of unicast)
+ * @from: seqno of the 1st packet in retransmit request
+ * @rc: returned code
+ *
+ * Return: true if the repeated retransmit failures happens, otherwise
+ * false
+ */
+static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
+ u16 from, int *rc)
{
- struct tipc_msg *hdr = buf_msg(skb);
+ struct sk_buff *skb = skb_peek(&l->transmq);
+ struct tipc_msg *hdr;
+
+ if (!skb)
+ return false;
+ hdr = buf_msg(skb);
+
+ /* Detect repeated retransmit failures on same packet */
+ if (r->prev_from != from) {
+ r->prev_from = from;
+ r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
+ } else if (time_after(jiffies, r->stale_limit)) {
+ pr_warn("Retransmission failure on link <%s>\n", l->name);
+ link_print(l, "State of link ");
+ pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+ msg_user(hdr), msg_type(hdr), msg_size(hdr),
+ msg_errcode(hdr));
+ pr_info("sqno %u, prev: %x, src: %x\n",
+ msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
+
+ trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
+ trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
+ trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
+
+ if (link_is_bc_sndlink(l))
+ *rc = TIPC_LINK_DOWN_EVT;
+
+ *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ return true;
+ }
- pr_warn("Retransmission failure on link <%s>\n", l->name);
- link_print(l, "State of link ");
- pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
- msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
- pr_info("sqno %u, prev: %x, src: %x\n",
- msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
+ return false;
}
-/* tipc_link_retrans() - retransmit one or more packets
+/* tipc_link_bc_retrans() - retransmit zero or more packets
* @l: the link to transmit on
* @r: the receiving link ordering the retransmit. Same as l if unicast
* @from: retransmit from (inclusive) this sequence number
* @to: retransmit to (inclusive) this sequence number
* xmitq: queue for accumulating the retransmitted packets
*/
-static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
- u16 from, u16 to, struct sk_buff_head *xmitq)
+static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
+ u16 from, u16 to, struct sk_buff_head *xmitq)
{
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
struct tipc_msg *hdr;
+ int rc = 0;
if (!skb)
return 0;
@@ -1077,20 +1110,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
return 0;
trace_tipc_link_retrans(r, from, to, &l->transmq);
- /* Detect repeated retransmit failures on same packet */
- if (r->prev_from != from) {
- r->prev_from = from;
- r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
- r->stale_cnt = 0;
- } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
- link_retransmit_failure(l, skb);
- trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
- trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
- trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
- if (link_is_bc_sndlink(l))
- return TIPC_LINK_DOWN_EVT;
- return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
- }
+
+ if (link_retransmit_failure(l, r, from, &rc))
+ return rc;
skb_queue_walk(&l->transmq, skb) {
hdr = buf_msg(skb);
@@ -1103,7 +1125,7 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
continue;
TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
}
- _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+ _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
hdr = buf_msg(_skb);
@@ -1324,17 +1346,23 @@ exit:
* @gap: # of gap packets
* @ga: buffer pointer to Gap ACK blocks from peer
* @xmitq: queue for accumulating the retransmitted packets if any
+ *
+ * In case of a repeated retransmit failures, the call will return shortly
+ * with a returned code (e.g. TIPC_LINK_DOWN_EVT)
*/
-static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
- struct tipc_gap_ack_blks *ga,
- struct sk_buff_head *xmitq)
+static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
+ struct tipc_gap_ack_blks *ga,
+ struct sk_buff_head *xmitq)
{
struct sk_buff *skb, *_skb, *tmp;
struct tipc_msg *hdr;
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
- u16 seqno;
- u16 n = 0;
+ u16 seqno, n = 0;
+ int rc = 0;
+
+ if (gap && link_retransmit_failure(l, l, acked + 1, &rc))
+ return rc;
skb_queue_walk_safe(&l->transmq, skb, tmp) {
seqno = buf_seqno(skb);
@@ -1369,6 +1397,8 @@ next_gap_ack:
goto next_gap_ack;
}
}
+
+ return 0;
}
/* tipc_link_build_state_msg: prepare link state message for transmission
@@ -1481,7 +1511,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Forward queues and wake up waiting users */
if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
- l->stale_cnt = 0;
tipc_link_advance_backlog(l, xmitq);
if (unlikely(!skb_queue_empty(&l->wakeupq)))
link_prepare_wakeup(l);
@@ -1918,7 +1947,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
tipc_link_build_proto_msg(l, STATE_MSG, 0, reply,
rcvgap, 0, 0, xmitq);
- tipc_link_advance_transmq(l, ack, gap, ga, xmitq);
+ rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq);
/* If NACK, retransmit will now start at right position */
if (gap)
@@ -2035,7 +2064,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
- rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
+ rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq);
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
@@ -2131,7 +2160,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
if (dnode == tipc_own_addr(l->net)) {
tipc_link_bc_ack_rcv(l, acked, xmitq);
- rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq);
+ rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq);
l->stats.recv_nacks++;
return rc;
}
@@ -2550,7 +2579,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf)
i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
i += scnprintf(buf + i, sz - i, " %u", l->prev_from);
- i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", 0);
i += scnprintf(buf + i, sz - i, " %u", l->acked);
list = &l->transmq;
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 8de02ad6e352..da509f0eb9ca 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m)
return ((unchar *)m) + msg_hdr_sz(m);
}
-static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
+static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m)
{
return (struct tipc_msg *)msg_data(m);
}
@@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a)
static inline u32 msg_origport(struct tipc_msg *m)
{
if (msg_user(m) == MSG_FRAGMENTER)
- m = msg_get_wrapped(m);
+ m = msg_inner_hdr(m);
return msg_word(m, 4);
}
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 99bd166bccec..d6165ad384c0 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -261,7 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = {
.version = TIPC_GENL_V2_VERSION,
.hdrsize = 0,
.maxattr = TIPC_NLA_MAX,
- .policy = tipc_nl_policy,
+ .policy = tipc_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
.ops = tipc_genl_v2_ops,
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index cf155061c472..d86030ef1232 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -691,7 +691,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb,
struct nlattr *prop;
struct nlattr *media;
struct tipc_link_config *lc;
- int len;
lc = (struct tipc_link_config *)TLV_DATA(msg->req);
@@ -699,10 +698,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb,
if (!media)
return -EMSGSIZE;
- len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME);
- if (!string_is_valid(lc->name, len))
- return -EINVAL;
-
if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name))
return -EMSGSIZE;
@@ -723,7 +718,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb,
struct nlattr *prop;
struct nlattr *bearer;
struct tipc_link_config *lc;
- int len;
lc = (struct tipc_link_config *)TLV_DATA(msg->req);
@@ -731,10 +725,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb,
if (!bearer)
return -EMSGSIZE;
- len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME);
- if (!string_is_valid(lc->name, len))
- return -EINVAL;
-
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name))
return -EMSGSIZE;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 550581d47d51..324a1f91b394 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int usr = msg_user(hdr);
int mtyp = msg_type(hdr);
u16 oseqno = msg_seqno(hdr);
- u16 iseqno = msg_seqno(msg_get_wrapped(hdr));
+ u16 iseqno = msg_seqno(msg_inner_hdr(hdr));
u16 exp_pkts = msg_msgcnt(hdr);
u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
int state = n->state;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 1f9cf57d9754..40076f423dcb 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -209,6 +209,29 @@ void tls_device_free_resources_tx(struct sock *sk)
tls_free_partial_record(sk, tls_ctx);
}
+static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
+ u32 seq)
+{
+ struct net_device *netdev;
+ struct sk_buff *skb;
+ u8 *rcd_sn;
+
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ TCP_SKB_CB(skb)->eor = 1;
+
+ rcd_sn = tls_ctx->tx.rec_seq;
+
+ down_read(&device_offload_lock);
+ netdev = tls_ctx->netdev;
+ if (netdev)
+ netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ up_read(&device_offload_lock);
+
+ clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
+}
+
static void tls_append_frag(struct tls_record_info *record,
struct page_frag *pfrag,
int size)
@@ -252,7 +275,7 @@ static int tls_push_record(struct sock *sk,
skb_frag_address(frag),
record->len - prot->prepend_size,
record_type,
- ctx->crypto_send.info.version);
+ prot->version);
/* HW doesn't care about the data in the tag, because it fills it. */
dummy_tag_frag.page = skb_frag_page(frag);
@@ -264,7 +287,11 @@ static int tls_push_record(struct sock *sk,
list_add_tail(&record->list, &offload_ctx->records_list);
spin_unlock_irq(&offload_ctx->lock);
offload_ctx->open_record = NULL;
- tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version);
+
+ if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
+ tls_device_resync_tx(sk, ctx, tp->write_seq);
+
+ tls_advance_record_sn(sk, prot, &ctx->tx);
for (i = 0; i < record->num_frags; i++) {
frag = &record->frags[i];
@@ -551,7 +578,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
}
static void tls_device_resync_rx(struct tls_context *tls_ctx,
- struct sock *sk, u32 seq, u64 rcd_sn)
+ struct sock *sk, u32 seq, u8 *rcd_sn)
{
struct net_device *netdev;
@@ -559,14 +586,17 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
return;
netdev = READ_ONCE(tls_ctx->netdev);
if (netdev)
- netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn);
+ netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
+ TLS_OFFLOAD_CTX_DIR_RX);
clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
}
-void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_rx *rx_ctx;
+ u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+ struct tls_prot_info *prot;
u32 is_req_pending;
s64 resync_req;
u32 req_seq;
@@ -574,15 +604,83 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
if (tls_ctx->rx_conf != TLS_HW)
return;
+ prot = &tls_ctx->prot_info;
rx_ctx = tls_offload_ctx_rx(tls_ctx);
- resync_req = atomic64_read(&rx_ctx->resync_req);
- req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
- is_req_pending = resync_req;
+ memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
- if (unlikely(is_req_pending) && req_seq == seq &&
- atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+ switch (rx_ctx->resync_type) {
+ case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ:
+ resync_req = atomic64_read(&rx_ctx->resync_req);
+ req_seq = resync_req >> 32;
seq += TLS_HEADER_SIZE - 1;
- tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+ is_req_pending = resync_req;
+
+ if (likely(!is_req_pending) || req_seq != seq ||
+ !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+ return;
+ break;
+ case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT:
+ if (likely(!rx_ctx->resync_nh_do_now))
+ return;
+
+ /* head of next rec is already in, note that the sock_inq will
+ * include the currently parsed message when called from parser
+ */
+ if (tcp_inq(sk) > rcd_len)
+ return;
+
+ rx_ctx->resync_nh_do_now = 0;
+ seq += rcd_len;
+ tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+ break;
+ }
+
+ tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+}
+
+static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
+ struct tls_offload_context_rx *ctx,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct strp_msg *rxm;
+
+ /* device will request resyncs by itself based on stream scan */
+ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT)
+ return;
+ /* already scheduled */
+ if (ctx->resync_nh_do_now)
+ return;
+ /* seen decrypted fragments since last fully-failed record */
+ if (ctx->resync_nh_reset) {
+ ctx->resync_nh_reset = 0;
+ ctx->resync_nh.decrypted_failed = 1;
+ ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL;
+ return;
+ }
+
+ if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt)
+ return;
+
+ /* doing resync, bump the next target in case it fails */
+ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL)
+ ctx->resync_nh.decrypted_tgt *= 2;
+ else
+ ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL;
+
+ rxm = strp_msg(skb);
+
+ /* head of next rec is already in, parser will sync for us */
+ if (tcp_inq(sk) > rxm->full_len) {
+ ctx->resync_nh_do_now = 1;
+ } else {
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+
+ memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
+ tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+
+ tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq,
+ rcd_sn);
}
}
@@ -610,8 +708,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
sg_set_buf(&sg[0], buf,
rxm->full_len + TLS_HEADER_SIZE +
TLS_CIPHER_AES_GCM_128_IV_SIZE);
- skb_copy_bits(skb, offset, buf,
- TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ err = skb_copy_bits(skb, offset, buf,
+ TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ if (err)
+ goto free_buf;
/* We are interested only in the decrypted data not the auth */
err = decrypt_skb(sk, skb, sg);
@@ -625,8 +725,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
if (skb_pagelen(skb) > offset) {
copy = min_t(int, skb_pagelen(skb) - offset, data_len);
- if (skb->decrypted)
- skb_store_bits(skb, offset, buf, copy);
+ if (skb->decrypted) {
+ err = skb_store_bits(skb, offset, buf, copy);
+ if (err)
+ goto free_buf;
+ }
offset += copy;
buf += copy;
@@ -649,8 +752,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
copy = min_t(int, skb_iter->len - frag_pos,
data_len + rxm->offset - offset);
- if (skb_iter->decrypted)
- skb_store_bits(skb_iter, frag_pos, buf, copy);
+ if (skb_iter->decrypted) {
+ err = skb_store_bits(skb_iter, frag_pos, buf, copy);
+ if (err)
+ goto free_buf;
+ }
offset += copy;
buf += copy;
@@ -671,10 +777,6 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
int is_encrypted = !is_decrypted;
struct sk_buff *skb_iter;
- /* Skip if it is already decrypted */
- if (ctx->sw.decrypted)
- return 0;
-
/* Check if all the data is decrypted already */
skb_walk_frags(skb, skb_iter) {
is_decrypted &= skb_iter->decrypted;
@@ -683,12 +785,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
ctx->sw.decrypted |= is_decrypted;
- /* Return immedeatly if the record is either entirely plaintext or
+ /* Return immediately if the record is either entirely plaintext or
* entirely ciphertext. Otherwise handle reencrypt partially decrypted
* record.
*/
- return (is_encrypted || is_decrypted) ? 0 :
- tls_device_reencrypt(sk, skb);
+ if (is_decrypted) {
+ ctx->resync_nh_reset = 1;
+ return 0;
+ }
+ if (is_encrypted) {
+ tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb);
+ return 0;
+ }
+
+ ctx->resync_nh_reset = 1;
+ return tls_device_reencrypt(sk, skb);
}
static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
@@ -757,6 +868,12 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto free_offload_ctx;
}
+ /* Sanity-check the rec_seq_size for stack allocations */
+ if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
+ rc = -EINVAL;
+ goto free_offload_ctx;
+ }
+
prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
prot->tag_size = tag_size;
prot->overhead_size = prot->prepend_size + prot->tag_size;
@@ -908,6 +1025,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
rc = -ENOMEM;
goto release_netdev;
}
+ context->resync_nh_reset = 1;
ctx->priv_ctx_rx = context;
rc = tls_set_sw_offload(sk, ctx, 0);
@@ -1015,7 +1133,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
case NETDEV_REGISTER:
case NETDEV_FEAT_CHANGE:
if ((dev->features & NETIF_F_HW_TLS_RX) &&
- !dev->tlsdev_ops->tls_dev_resync_rx)
+ !dev->tlsdev_ops->tls_dev_resync)
return NOTIFY_BAD;
if (dev->tlsdev_ops &&
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index c3a5fe624b4e..1d2d804ac633 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -240,7 +240,6 @@ static int fill_sg_in(struct scatterlist *sg_in,
record = tls_get_record(ctx, tcp_seq, rcd_sn);
if (!record) {
spin_unlock_irqrestore(&ctx->lock, flags);
- WARN(1, "Record not found for seq %u\n", tcp_seq);
return -EINVAL;
}
@@ -409,7 +408,10 @@ put_sg:
put_page(sg_page(&sg_in[--resync_sgs]));
kfree(sg_in);
free_orig:
- kfree_skb(skb);
+ if (nskb)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return nskb;
}
@@ -424,6 +426,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
}
EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
+struct sk_buff *tls_encrypt_skb(struct sk_buff *skb)
+{
+ return tls_sw_fallback(skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(tls_encrypt_skb);
+
int tls_sw_fallback_init(struct sock *sk,
struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 455a782c7658..db585964b52b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk,
/* Unhook the record from context if encryption is not failure */
ctx->open_rec = NULL;
- tls_advance_record_sn(sk, &tls_ctx->tx, prot->version);
+ tls_advance_record_sn(sk, prot, &tls_ctx->tx);
return rc;
}
@@ -1485,15 +1485,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct tls_prot_info *prot = &tls_ctx->prot_info;
- int version = prot->version;
struct strp_msg *rxm = strp_msg(skb);
int pad, err = 0;
if (!ctx->decrypted) {
#ifdef CONFIG_TLS_DEVICE
- err = tls_device_decrypted(sk, skb);
- if (err < 0)
- return err;
+ if (tls_ctx->rx_conf == TLS_HW) {
+ err = tls_device_decrypted(sk, skb);
+ if (err < 0)
+ return err;
+ }
#endif
/* Still not decrypted after tls_device */
if (!ctx->decrypted) {
@@ -1501,8 +1502,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
async);
if (err < 0) {
if (err == -EINPROGRESS)
- tls_advance_record_sn(sk, &tls_ctx->rx,
- version);
+ tls_advance_record_sn(sk, prot,
+ &tls_ctx->rx);
return err;
}
@@ -1517,7 +1518,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
rxm->full_len -= pad;
rxm->offset += prot->prepend_size;
rxm->full_len -= prot->overhead_size;
- tls_advance_record_sn(sk, &tls_ctx->rx, version);
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
ctx->decrypted = true;
ctx->saved_data_ready(sk);
} else {
@@ -2013,8 +2014,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
goto read_failure;
}
#ifdef CONFIG_TLS_DEVICE
- handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
- *(u64*)tls_ctx->rx.rec_seq);
+ tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
+ TCP_SKB_CB(skb)->seq + rxm->offset);
#endif
return data_len + TLS_HEADER_SIZE;
@@ -2281,8 +2282,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto free_priv;
}
- /* Sanity-check the IV size for stack allocations. */
- if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
+ /* Sanity-check the sizes for stack allocations. */
+ if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
+ rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
rc = -EINVAL;
goto free_priv;
}
diff --git a/net/unix/diag.c b/net/unix/diag.c
index c51a707260fa..9ff64f9df1f3 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -5,9 +5,11 @@
#include <linux/unix_diag.h>
#include <linux/skbuff.h>
#include <linux/module.h>
+#include <linux/uidgid.h>
#include <net/netlink.h>
#include <net/af_unix.h>
#include <net/tcp_states.h>
+#include <net/sock.h>
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
{
@@ -111,6 +113,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
}
+static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb)
+{
+ uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk));
+ return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
+}
+
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
u32 portid, u32 seq, u32 flags, int sk_ino)
{
@@ -157,6 +165,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
goto out_nlmsg_trim;
+ if ((req->udiag_show & UDIAG_SHOW_UID) &&
+ sk_diag_dump_uid(sk, skb))
+ goto out_nlmsg_trim;
+
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 169112f8aa1e..ab47bf3ab66e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -274,7 +274,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected);
void vsock_remove_bound(struct vsock_sock *vsk)
{
spin_lock_bh(&vsock_table_lock);
- __vsock_remove_bound(vsk);
+ if (__vsock_in_bound_table(vsk))
+ __vsock_remove_bound(vsk);
spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);
@@ -282,7 +283,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound);
void vsock_remove_connected(struct vsock_sock *vsk)
{
spin_lock_bh(&vsock_table_lock);
- __vsock_remove_connected(vsk);
+ if (__vsock_in_connected_table(vsk))
+ __vsock_remove_connected(vsk);
spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);
@@ -318,35 +320,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
-static bool vsock_in_bound_table(struct vsock_sock *vsk)
-{
- bool ret;
-
- spin_lock_bh(&vsock_table_lock);
- ret = __vsock_in_bound_table(vsk);
- spin_unlock_bh(&vsock_table_lock);
-
- return ret;
-}
-
-static bool vsock_in_connected_table(struct vsock_sock *vsk)
-{
- bool ret;
-
- spin_lock_bh(&vsock_table_lock);
- ret = __vsock_in_connected_table(vsk);
- spin_unlock_bh(&vsock_table_lock);
-
- return ret;
-}
-
void vsock_remove_sock(struct vsock_sock *vsk)
{
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
+ vsock_remove_bound(vsk);
+ vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);
@@ -477,8 +454,7 @@ static void vsock_pending_work(struct work_struct *work)
* incoming packets can't find this socket, and to reduce the reference
* count.
*/
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
+ vsock_remove_connected(vsk);
sk->sk_state = TCP_CLOSE;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 62dcdf082349..f2084e3f7aa4 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -14,14 +14,14 @@
#include <net/sock.h>
#include <net/af_vsock.h>
-/* The host side's design of the feature requires 6 exact 4KB pages for
- * recv/send rings respectively -- this is suboptimal considering memory
- * consumption, however unluckily we have to live with it, before the
- * host comes up with a better design in the future.
+/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer
+ * hosts don't have this limitation; but, keep the defaults the same for compat.
*/
#define PAGE_SIZE_4K 4096
#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
+#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64)
/* The MTU is 16KB per the host side's design */
#define HVS_MTU_SIZE (1024 * 16)
@@ -46,8 +46,9 @@ struct hvs_recv_buf {
};
/* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
- * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated
- * buffer, because tests show there is no significant performance difference.
+ * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
+ * guest and the host processing as one VMBUS packet is the smallest processing
+ * unit.
*
* Note: the buffer can be eliminated in the future when we add new VMBus
* ringbuffer APIs that allow us to directly copy data from userspace buffer
@@ -321,8 +322,11 @@ static void hvs_open_connection(struct vmbus_channel *chan)
struct sockaddr_vm addr;
struct sock *sk, *new = NULL;
struct vsock_sock *vnew = NULL;
- struct hvsock *hvs, *hvs_new = NULL;
+ struct hvsock *hvs = NULL;
+ struct hvsock *hvs_new = NULL;
+ int rcvbuf;
int ret;
+ int sndbuf;
if_type = &chan->offermsg.offer.if_type;
if_instance = &chan->offermsg.offer.if_instance;
@@ -364,9 +368,34 @@ static void hvs_open_connection(struct vmbus_channel *chan)
}
set_channel_read_mode(chan, HV_CALL_DIRECT);
- ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE,
- RINGBUFFER_HVS_RCV_SIZE, NULL, 0,
- hvs_channel_cb, conn_from_host ? new : sk);
+
+ /* Use the socket buffer sizes as hints for the VMBUS ring size. For
+ * server side sockets, 'sk' is the parent socket and thus, this will
+ * allow the child sockets to inherit the size from the parent. Keep
+ * the mins to the default value and align to page size as per VMBUS
+ * requirements.
+ * For the max, the socket core library will limit the socket buffer
+ * size that can be set by the user, but, since currently, the hv_sock
+ * VMBUS ring buffer is physically contiguous allocation, restrict it
+ * further.
+ * Older versions of hv_sock host side code cannot handle bigger VMBUS
+ * ring buffer size. Use the version number to limit the change to newer
+ * versions.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5) {
+ sndbuf = RINGBUFFER_HVS_SND_SIZE;
+ rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
+ } else {
+ sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
+ sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
+ sndbuf = ALIGN(sndbuf, PAGE_SIZE);
+ rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
+ rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
+ rcvbuf = ALIGN(rcvbuf, PAGE_SIZE);
+ }
+
+ ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
+ conn_from_host ? new : sk);
if (ret != 0) {
if (conn_from_host) {
hvs_new->chan = NULL;
@@ -424,6 +453,7 @@ static u32 hvs_get_local_cid(void)
static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
{
struct hvsock *hvs;
+ struct sock *sk = sk_vsock(vsk);
hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
if (!hvs)
@@ -431,7 +461,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
vsk->trans = hvs;
hvs->vsk = vsk;
-
+ sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
+ sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
return 0;
}
@@ -627,7 +658,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
struct hvsock *hvs = vsk->trans;
struct vmbus_channel *chan = hvs->chan;
struct hvs_send_buf *send_buf;
- ssize_t to_write, max_writable, ret;
+ ssize_t to_write, max_writable;
+ ssize_t ret = 0;
+ ssize_t bytes_written = 0;
BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
@@ -635,20 +668,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
if (!send_buf)
return -ENOMEM;
- max_writable = hvs_channel_writable_bytes(chan);
- to_write = min_t(ssize_t, len, max_writable);
- to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
-
- ret = memcpy_from_msg(send_buf->data, msg, to_write);
- if (ret < 0)
- goto out;
+ /* Reader(s) could be draining data from the channel as we write.
+ * Maximize bandwidth, by iterating until the channel is found to be
+ * full.
+ */
+ while (len) {
+ max_writable = hvs_channel_writable_bytes(chan);
+ if (!max_writable)
+ break;
+ to_write = min_t(ssize_t, len, max_writable);
+ to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
+ /* memcpy_from_msg is safe for loop as it advances the offsets
+ * within the message iterator.
+ */
+ ret = memcpy_from_msg(send_buf->data, msg, to_write);
+ if (ret < 0)
+ goto out;
- ret = hvs_send_data(hvs->chan, send_buf, to_write);
- if (ret < 0)
- goto out;
+ ret = hvs_send_data(hvs->chan, send_buf, to_write);
+ if (ret < 0)
+ goto out;
- ret = to_write;
+ bytes_written += to_write;
+ len -= to_write;
+ }
out:
+ /* If any data has been sent, return that */
+ if (bytes_written)
+ ret = bytes_written;
kfree(send_buf);
return ret;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 53ad3dbb76fe..45d9afcff6d5 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy)
return -EINVAL;
}
+ for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
+ /*
+ * Validate we have a policy (can be explicitly set to
+ * VENDOR_CMD_RAW_DATA which is non-NULL) and also that
+ * we have at least one of doit/dumpit.
+ */
+ if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy))
+ return -EINVAL;
+ if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit &&
+ !rdev->wiphy.vendor_commands[i].dumpit))
+ return -EINVAL;
+ }
+
#ifdef CONFIG_PM
if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns &&
(!rdev->wiphy.wowlan->pattern_min_len ||
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 84d36ca7a7ab..ee8388fe4a92 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
+struct cfg80211_internal_bss *
+cfg80211_bss_update(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *tmp,
+ bool signal_valid, unsigned long ts);
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
#define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond)
#else
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 520d437aa8d1..fc83dd179c1a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -571,6 +571,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_PEER_MEASUREMENTS] =
NLA_POLICY_NESTED(nl80211_pmsr_attr_policy),
[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
+ .len = SAE_PASSWORD_MAX_LEN },
+ [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -4447,6 +4450,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
return true;
case NL80211_CMD_CONNECT:
if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
auth_type == NL80211_AUTHTYPE_SAE)
return false;
@@ -4637,6 +4642,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(params.acl);
}
+ params.twt_responder =
+ nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]);
+
nl80211_calculate_ap_params(&params);
if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
@@ -8751,7 +8759,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
static bool nl80211_valid_wpa_versions(u32 wpa_versions)
{
return !(wpa_versions & ~(NL80211_WPA_VERSION_1 |
- NL80211_WPA_VERSION_2));
+ NL80211_WPA_VERSION_2 |
+ NL80211_WPA_VERSION_3));
}
static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
@@ -8987,6 +8996,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]);
}
+ if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD))
+ return -EINVAL;
+ settings->sae_pwd =
+ nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+ settings->sae_pwd_len =
+ nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+ }
+
return 0;
}
@@ -12669,6 +12688,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb,
return 0;
}
+static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd,
+ struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ if (vcmd->policy == VENDOR_CMD_RAW_DATA) {
+ if (attr->nla_type & NLA_F_NESTED) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "unexpected nested data");
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ if (!(attr->nla_type & NLA_F_NESTED)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data");
+ return -EINVAL;
+ }
+
+ return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy,
+ extack);
+}
+
static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -12727,11 +12769,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_VENDOR_DATA]) {
data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]);
len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]);
+
+ err = nl80211_vendor_check_policy(vcmd,
+ info->attrs[NL80211_ATTR_VENDOR_DATA],
+ info->extack);
+ if (err)
+ return err;
}
rdev->cur_cmd_info = info;
- err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev,
- data, len);
+ err = vcmd->doit(&rdev->wiphy, wdev, data, len);
rdev->cur_cmd_info = NULL;
return err;
}
@@ -12818,6 +12865,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
+
+ err = nl80211_vendor_check_policy(
+ &(*rdev)->wiphy.vendor_commands[vcmd_idx],
+ attrbuf[NL80211_ATTR_VENDOR_DATA],
+ cb->extack);
+ if (err)
+ return err;
}
/* 0 is the first index - add 1 to parse only once */
@@ -15086,7 +15140,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
return;
}
- if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -15376,6 +15432,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
}
EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);
+void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan);
+ nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL,
+ rdev, wdev, cookie, chan, 0, gfp);
+}
+EXPORT_SYMBOL(cfg80211_tx_mgmt_expired);
+
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
struct station_info *sinfo, gfp_t gfp)
{
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index aa571d727903..d66e6d4b7555 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss {
};
/* Returned bss is reference counted and must be cleaned up appropriately. */
-static struct cfg80211_internal_bss *
+struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
struct cfg80211_internal_bss *tmp,
- bool signal_valid)
+ bool signal_valid, unsigned long ts)
{
struct cfg80211_internal_bss *found = NULL;
if (WARN_ON(!tmp->pub.channel))
return NULL;
- tmp->ts = jiffies;
+ tmp->ts = ts;
spin_lock_bh(&rdev->bss_lock);
@@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
- res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+ jiffies);
if (!res)
return NULL;
@@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
- res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+ jiffies);
if (!res)
return NULL;
@@ -1972,6 +1974,27 @@ out:
}
EXPORT_SYMBOL(cfg80211_unlink_bss);
+void cfg80211_bss_iter(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ void (*iter)(struct wiphy *wiphy,
+ struct cfg80211_bss *bss,
+ void *data),
+ void *iter_data)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss;
+
+ spin_lock_bh(&rdev->bss_lock);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel))
+ iter(wiphy, &bss->pub, iter_data);
+ }
+
+ spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_bss_iter);
+
#ifdef CONFIG_CFG80211_WEXT
static struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 7d34cb884840..7a6c38ddc65a 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev,
u8 *next;
if (params->bss) {
- /* Make sure the bss entry provided by the driver is valid. */
struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss);
- if (WARN_ON(list_empty(&ibss->list))) {
- cfg80211_put_bss(wdev->wiphy, params->bss);
- return;
+ if (list_empty(&ibss->list)) {
+ struct cfg80211_bss *found = NULL, *tmp = params->bss;
+
+ found = cfg80211_get_bss(wdev->wiphy, NULL,
+ params->bss->bssid,
+ wdev->ssid, wdev->ssid_len,
+ wdev->conn_bss_type,
+ IEEE80211_PRIVACY_ANY);
+ if (found) {
+ /* The same BSS is already updated so use it
+ * instead, as it has latest info.
+ */
+ params->bss = found;
+ } else {
+ /* Update with BSS provided by driver, it will
+ * be freshly added and ref cnted, we can free
+ * the old one.
+ *
+ * signal_valid can be false, as we are not
+ * expecting the BSS to be found.
+ *
+ * keep the old timestamp to avoid confusion
+ */
+ cfg80211_bss_update(rdev, ibss, false,
+ ibss->ts);
+ }
+
+ cfg80211_put_bss(wdev->wiphy, tmp);
}
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2abfff925aac..4fbb91a511ae 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired,
WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);
+TRACE_EVENT(cfg80211_tx_mgmt_expired,
+ TP_PROTO(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan),
+ TP_ARGS(wdev, cookie, chan),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u64, cookie)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ CHAN_ASSIGN(chan);
+ ),
+ TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
+ WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
+);
+
TRACE_EVENT(cfg80211_new_sta,
TP_PROTO(struct net_device *netdev, const u8 *mac_addr,
struct station_info *sinfo),