summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/debugfs.c13
-rw-r--r--net/8021q/vlan.c101
-rw-r--r--net/8021q/vlan.h12
-rw-r--r--net/8021q/vlan_core.c128
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/Kconfig4
-rw-r--r--net/batman-adv/Kconfig10
-rw-r--r--net/batman-adv/bat_iv_ogm.c25
-rw-r--r--net/batman-adv/bat_v.c26
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c82
-rw-r--r--net/batman-adv/debugfs.c2
-rw-r--r--net/batman-adv/distributed-arp-table.c42
-rw-r--r--net/batman-adv/gateway_client.c3
-rw-r--r--net/batman-adv/hard-interface.c3
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h6
-rw-r--r--net/batman-adv/log.c60
-rw-r--r--net/batman-adv/main.c3
-rw-r--r--net/batman-adv/main.h3
-rw-r--r--net/batman-adv/multicast.c51
-rw-r--r--net/batman-adv/netlink.c24
-rw-r--r--net/batman-adv/trace.c2
-rw-r--r--net/batman-adv/trace.h6
-rw-r--r--net/batman-adv/translation-table.c41
-rw-r--r--net/batman-adv/types.h5
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/hci_event.c6
-rw-r--r--net/bluetooth/hci_request.c2
-rw-r--r--net/bluetooth/l2cap_core.c12
-rw-r--r--net/bluetooth/rfcomm/core.c12
-rw-r--r--net/bluetooth/rfcomm/sock.c12
-rw-r--r--net/bluetooth/sco.c12
-rw-r--r--net/bluetooth/smp.c8
-rw-r--r--net/bpf/test_run.c36
-rw-r--r--net/bridge/br.c89
-rw-r--r--net/bridge/br_device.c11
-rw-r--r--net/bridge/br_fdb.c46
-rw-r--r--net/bridge/br_if.c23
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_mdb.c126
-rw-r--r--net/bridge/br_multicast.c442
-rw-r--r--net/bridge/br_netfilter_hooks.c54
-rw-r--r--net/bridge/br_netfilter_ipv6.c4
-rw-r--r--net/bridge/br_netlink.c71
-rw-r--r--net/bridge/br_private.h81
-rw-r--r--net/bridge/br_switchdev.c5
-rw-r--r--net/bridge/br_sysfs_br.c36
-rw-r--r--net/bridge/br_sysfs_if.c3
-rw-r--r--net/bridge/br_vlan.c71
-rw-r--r--net/can/raw.c2
-rw-r--r--net/compat.c34
-rw-r--r--net/core/datagram.c204
-rw-r--r--net/core/dev.c217
-rw-r--r--net/core/dev_addr_lists.c100
-rw-r--r--net/core/dev_ioctl.c4
-rw-r--r--net/core/devlink.c5
-rw-r--r--net/core/filter.c485
-rw-r--r--net/core/flow_dissector.c9
-rw-r--r--net/core/gro_cells.c1
-rw-r--r--net/core/neighbour.c455
-rw-r--r--net/core/net-sysfs.c2
-rw-r--r--net/core/net_namespace.c162
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/rtnetlink.c520
-rw-r--r--net/core/skbuff.c313
-rw-r--r--net/core/skmsg.c28
-rw-r--r--net/core/sock.c14
-rw-r--r--net/core/sock_reuseport.c1
-rw-r--r--net/core/stream.c2
-rw-r--r--net/core/sysctl_net_core.c20
-rw-r--r--net/dccp/ipv4.c13
-rw-r--r--net/dccp/ipv6.c13
-rw-r--r--net/dccp/proto.c7
-rw-r--r--net/decnet/af_decnet.c4
-rw-r--r--net/dsa/Kconfig4
-rw-r--r--net/dsa/dsa.c8
-rw-r--r--net/dsa/dsa_priv.h2
-rw-r--r--net/dsa/master.c63
-rw-r--r--net/dsa/port.c3
-rw-r--r--net/dsa/slave.c86
-rw-r--r--net/dsa/tag_brcm.c2
-rw-r--r--net/dsa/tag_dsa.c1
-rw-r--r--net/dsa/tag_edsa.c1
-rw-r--r--net/dsa/tag_gswip.c1
-rw-r--r--net/dsa/tag_ksz.c117
-rw-r--r--net/dsa/tag_lan9303.c1
-rw-r--r--net/dsa/tag_mtk.c1
-rw-r--r--net/dsa/tag_qca.c1
-rw-r--r--net/dsa/tag_trailer.c1
-rw-r--r--net/ethernet/eth.c56
-rw-r--r--net/ieee802154/6lowpan/tx.c3
-rw-r--r--net/ieee802154/nl-phy.c2
-rw-r--r--net/ipv4/af_inet.c17
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/esp4.c9
-rw-r--r--net/ipv4/esp4_offload.c15
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fou.c75
-rw-r--r--net/ipv4/gre_demux.c9
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_diag.c4
-rw-r--r--net/ipv4/inet_hashtables.c117
-rw-r--r--net/ipv4/ip_forward.c8
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_gre.c56
-rw-r--r--net/ipv4/ip_input.c77
-rw-r--r--net/ipv4/ip_output.c42
-rw-r--r--net/ipv4/ip_tunnel_core.c3
-rw-r--r--net/ipv4/ipconfig.c21
-rw-r--r--net/ipv4/ipip.c14
-rw-r--r--net/ipv4/ipmr.c19
-rw-r--r--net/ipv4/metrics.c26
-rw-r--r--net/ipv4/netfilter/Kconfig5
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c184
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c7
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c43
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c38
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c150
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c83
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c4
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c33
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c17
-rw-r--r--net/ipv4/tcp_bbr.c15
-rw-r--r--net/ipv4/tcp_bpf.c32
-rw-r--r--net/ipv4/tcp_input.c94
-rw-r--r--net/ipv4/tcp_ipv4.c132
-rw-r--r--net/ipv4/tcp_offload.c6
-rw-r--r--net/ipv4/tcp_output.c79
-rw-r--r--net/ipv4/tcp_timer.c20
-rw-r--r--net/ipv4/tunnel4.c18
-rw-r--r--net/ipv4/udp.c266
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udp_offload.c122
-rw-r--r--net/ipv4/udp_tunnel.c18
-rw-r--r--net/ipv4/udplite.c4
-rw-r--r--net/ipv4/xfrm4_protocol.c18
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/anycast.c6
-rw-r--r--net/ipv6/datagram.c10
-rw-r--r--net/ipv6/esp6.c9
-rw-r--r--net/ipv6/esp6_offload.c15
-rw-r--r--net/ipv6/fou6.c74
-rw-r--r--net/ipv6/icmp.c4
-rw-r--r--net/ipv6/inet6_hashtables.c58
-rw-r--r--net/ipv6/ip6_gre.c24
-rw-r--r--net/ipv6/ip6_input.c67
-rw-r--r--net/ipv6/ip6_offload.c48
-rw-r--r--net/ipv6/ip6_output.c92
-rw-r--r--net/ipv6/ip6_tunnel.c1
-rw-r--r--net/ipv6/ip6_udp_tunnel.c19
-rw-r--r--net/ipv6/ip6_vti.c1
-rw-r--r--net/ipv6/ip6mr.c13
-rw-r--r--net/ipv6/ipv6_sockglue.c2
-rw-r--r--net/ipv6/netfilter.c3
-rw-r--r--net/ipv6/netfilter/Makefile2
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c8
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c8
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c43
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c49
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c90
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c10
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c4
-rw-r--r--net/ipv6/raw.c7
-rw-r--r--net/ipv6/reassembly.c9
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/seg6_iptunnel.c1
-rw-r--r--net/ipv6/tcp_ipv6.c16
-rw-r--r--net/ipv6/tcpv6_offload.c7
-rw-r--r--net/ipv6/tunnel6.c12
-rw-r--r--net/ipv6/udp.c320
-rw-r--r--net/ipv6/udp_impl.h4
-rw-r--r--net/ipv6/udp_offload.c13
-rw-r--r--net/ipv6/udplite.c5
-rw-r--r--net/ipv6/xfrm6_input.c8
-rw-r--r--net/ipv6/xfrm6_policy.c1
-rw-r--r--net/ipv6/xfrm6_protocol.c18
-rw-r--r--net/ipv6/xfrm6_tunnel.c3
-rw-r--r--net/iucv/af_iucv.c41
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/l3mdev/l3mdev.c18
-rw-r--r--net/mac80211/Kconfig11
-rw-r--r--net/mac80211/cfg.c36
-rw-r--r--net/mac80211/debugfs_netdev.c3
-rw-r--r--net/mac80211/debugfs_sta.c14
-rw-r--r--net/mac80211/driver-ops.h34
-rw-r--r--net/mac80211/ieee80211_i.h1
-rw-r--r--net/mac80211/iface.c15
-rw-r--r--net/mac80211/main.c6
-rw-r--r--net/mac80211/mesh.c8
-rw-r--r--net/mac80211/mesh.h3
-rw-r--r--net/mac80211/mesh_plink.c35
-rw-r--r--net/mac80211/mlme.c65
-rw-r--r--net/mac80211/rx.c42
-rw-r--r--net/mac80211/scan.c22
-rw-r--r--net/mac80211/sta_info.c11
-rw-r--r--net/mac80211/sta_info.h2
-rw-r--r--net/mac80211/status.c7
-rw-r--r--net/mac80211/trace.h18
-rw-r--r--net/mac80211/tx.c15
-rw-r--r--net/mac80211/util.c51
-rw-r--r--net/mac80211/wep.c4
-rw-r--r--net/ncsi/internal.h24
-rw-r--r--net/ncsi/ncsi-aen.c75
-rw-r--r--net/ncsi/ncsi-manage.c550
-rw-r--r--net/ncsi/ncsi-netlink.c233
-rw-r--r--net/ncsi/ncsi-pkt.h9
-rw-r--r--net/ncsi/ncsi-rsp.c43
-rw-r--r--net/netfilter/Kconfig15
-rw-r--r--net/netfilter/Makefile7
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c13
-rw-r--r--net/netfilter/ipset/ip_set_core.c170
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h4
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c27
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c10
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c3
-rw-r--r--net/netfilter/nf_conncount.c46
-rw-r--r--net/netfilter/nf_conntrack_acct.c89
-rw-r--r--net/netfilter/nf_conntrack_core.c28
-rw-r--r--net/netfilter/nf_conntrack_ecache.c66
-rw-r--r--net/netfilter/nf_conntrack_helper.c69
-rw-r--r--net/netfilter/nf_conntrack_netlink.c30
-rw-r--r--net/netfilter/nf_conntrack_proto.c21
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c56
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c18
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c7
-rw-r--r--net/netfilter/nf_conntrack_standalone.c103
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c70
-rw-r--r--net/netfilter/nf_flow_table_core.c42
-rw-r--r--net/netfilter/nf_log_common.c20
-rw-r--r--net/netfilter/nf_nat_core.c330
-rw-r--r--net/netfilter/nf_nat_proto.c343
-rw-r--r--net/netfilter/nf_nat_proto_common.c120
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c82
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c77
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c85
-rw-r--r--net/netfilter/nf_nat_proto_udp.c130
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/netfilter/nf_nat_sip.c39
-rw-r--r--net/netfilter/nf_queue.c50
-rw-r--r--net/netfilter/nf_tables_api.c157
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c15
-rw-r--r--net/netfilter/nfnetlink_log.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c28
-rw-r--r--net/netfilter/nft_compat.c3
-rw-r--r--net/netfilter/nft_flow_offload.c5
-rw-r--r--net/netfilter/nft_meta.c2
-rw-r--r--net/netfilter/nft_xfrm.c2
-rw-r--r--net/netfilter/xt_RATEEST.c10
-rw-r--r--net/netfilter/xt_hashlimit.c13
-rw-r--r--net/netfilter/xt_physdev.c2
-rw-r--r--net/netfilter/xt_policy.c2
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/openvswitch/actions.c13
-rw-r--r--net/openvswitch/conntrack.c2
-rw-r--r--net/openvswitch/flow.c6
-rw-r--r--net/openvswitch/flow.h2
-rw-r--r--net/openvswitch/flow_netlink.c22
-rw-r--r--net/openvswitch/vport-geneve.c2
-rw-r--r--net/openvswitch/vport-gre.c2
-rw-r--r--net/openvswitch/vport-netdev.c1
-rw-r--r--net/openvswitch/vport-vxlan.c2
-rw-r--r--net/packet/af_packet.c14
-rw-r--r--net/rds/message.c24
-rw-r--r--net/rds/rdma.c75
-rw-r--r--net/rds/rds.h23
-rw-r--r--net/rds/send.c61
-rw-r--r--net/rfkill/rfkill-gpio.c1
-rw-r--r--net/sched/act_api.c221
-rw-r--r--net/sched/act_police.c24
-rw-r--r--net/sched/act_tunnel_key.c25
-rw-r--r--net/sched/act_vlan.c2
-rw-r--r--net/sched/cls_api.c337
-rw-r--r--net/sched/cls_bpf.c4
-rw-r--r--net/sched/cls_flower.c198
-rw-r--r--net/sched/cls_matchall.c5
-rw-r--r--net/sched/cls_u32.c10
-rw-r--r--net/sched/sch_api.c101
-rw-r--r--net/sched/sch_etf.c79
-rw-r--r--net/sched/sch_fq.c28
-rw-r--r--net/sched/sch_generic.c8
-rw-r--r--net/sched/sch_gred.c375
-rw-r--r--net/sched/sch_mq.c18
-rw-r--r--net/sched/sch_netem.c92
-rw-r--r--net/sched/sch_prio.c47
-rw-r--r--net/sched/sch_red.c48
-rw-r--r--net/sctp/associola.c11
-rw-r--r--net/sctp/bind_addr.c28
-rw-r--r--net/sctp/chunk.c14
-rw-r--r--net/sctp/input.c134
-rw-r--r--net/sctp/ipv6.c8
-rw-r--r--net/sctp/output.c1
-rw-r--r--net/sctp/primitive.c2
-rw-r--r--net/sctp/sm_make_chunk.c3
-rw-r--r--net/sctp/sm_sideeffect.c12
-rw-r--r--net/sctp/sm_statetable.c2
-rw-r--r--net/sctp/socket.c177
-rw-r--r--net/sctp/stream_interleave.c46
-rw-r--r--net/sctp/ulpqueue.c8
-rw-r--r--net/smc/af_smc.c65
-rw-r--r--net/smc/smc.h4
-rw-r--r--net/smc/smc_clc.c33
-rw-r--r--net/smc/smc_clc.h3
-rw-r--r--net/smc/smc_core.c16
-rw-r--r--net/smc/smc_core.h6
-rw-r--r--net/smc/smc_llc.c57
-rw-r--r--net/smc/smc_llc.h2
-rw-r--r--net/socket.c62
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c4
-rw-r--r--net/sunrpc/clnt.c9
-rw-r--r--net/sunrpc/socklib.c2
-rw-r--r--net/sunrpc/xprt.c48
-rw-r--r--net/sunrpc/xprtsock.c91
-rw-r--r--net/switchdev/switchdev.c213
-rw-r--r--net/tipc/Makefile4
-rw-r--r--net/tipc/bearer.c9
-rw-r--r--net/tipc/bearer.h2
-rw-r--r--net/tipc/link.c220
-rw-r--r--net/tipc/link.h2
-rw-r--r--net/tipc/msg.h1
-rw-r--r--net/tipc/netlink_compat.c7
-rw-r--r--net/tipc/node.c103
-rw-r--r--net/tipc/node.h1
-rw-r--r--net/tipc/socket.c267
-rw-r--r--net/tipc/socket.h4
-rw-r--r--net/tipc/sysctl.c8
-rw-r--r--net/tipc/trace.c206
-rw-r--r--net/tipc/trace.h431
-rw-r--r--net/tipc/udp_media.c9
-rw-r--r--net/tls/tls_main.c58
-rw-r--r--net/tls/tls_sw.c64
-rw-r--r--net/vmw_vsock/af_vsock.c7
-rw-r--r--net/vmw_vsock/vmci_transport.c67
-rw-r--r--net/wireless/Makefile1
-rw-r--r--net/wireless/chan.c3
-rw-r--r--net/wireless/core.c48
-rw-r--r--net/wireless/core.h5
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c2
-rw-r--r--net/wireless/lib80211_crypt_tkip.c4
-rw-r--r--net/wireless/lib80211_crypt_wep.c4
-rw-r--r--net/wireless/mlme.c4
-rw-r--r--net/wireless/nl80211.c312
-rw-r--r--net/wireless/nl80211.h32
-rw-r--r--net/wireless/pmsr.c590
-rw-r--r--net/wireless/rdev-ops.h25
-rw-r--r--net/wireless/scan.c2
-rw-r--r--net/wireless/sme.c8
-rw-r--r--net/wireless/trace.h92
-rw-r--r--net/wireless/util.c17
-rw-r--r--net/x25/af_x25.c18
-rw-r--r--net/x25/x25_in.c9
-rw-r--r--net/xdp/xsk.c16
-rw-r--r--net/xfrm/Kconfig1
-rw-r--r--net/xfrm/xfrm_device.c4
-rw-r--r--net/xfrm/xfrm_input.c83
-rw-r--r--net/xfrm/xfrm_interface.c2
-rw-r--r--net/xfrm/xfrm_output.c8
-rw-r--r--net/xfrm/xfrm_policy.c1270
-rw-r--r--net/xfrm/xfrm_state.c10
-rw-r--r--net/xfrm/xfrm_user.c4
369 files changed, 12808 insertions, 5914 deletions
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index 24915e0bb9ea..6c152f9ea26e 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -232,18 +232,7 @@ static int lowpan_context_show(struct seq_file *file, void *offset)
return 0;
}
-
-static int lowpan_context_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lowpan_context_show, inode->i_private);
-}
-
-static const struct file_operations lowpan_context_fops = {
- .open = lowpan_context_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(lowpan_context);
static int lowpan_short_addr_get(void *data, u64 *val)
{
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5e9950453955..dc4411165e43 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -330,6 +330,7 @@ static void vlan_transfer_features(struct net_device *dev,
vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
+ vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev);
netdev_update_features(vlandev);
}
@@ -357,6 +358,7 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event)
static int vlan_device_event(struct notifier_block *unused, unsigned long event,
void *ptr)
{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct vlan_group *grp;
struct vlan_info *vlan_info;
@@ -459,7 +461,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
vlan = vlan_dev_priv(vlandev);
if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
- dev_change_flags(vlandev, flgs | IFF_UP);
+ dev_change_flags(vlandev, flgs | IFF_UP,
+ extack);
netif_stacked_transfer_operstate(dev, vlandev);
}
break;
@@ -647,93 +650,6 @@ out:
return err;
}
-static struct sk_buff *vlan_gro_receive(struct list_head *head,
- struct sk_buff *skb)
-{
- const struct packet_offload *ptype;
- unsigned int hlen, off_vlan;
- struct sk_buff *pp = NULL;
- struct vlan_hdr *vhdr;
- struct sk_buff *p;
- __be16 type;
- int flush = 1;
-
- off_vlan = skb_gro_offset(skb);
- hlen = off_vlan + sizeof(*vhdr);
- vhdr = skb_gro_header_fast(skb, off_vlan);
- if (skb_gro_header_hard(skb, hlen)) {
- vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
- if (unlikely(!vhdr))
- goto out;
- }
-
- type = vhdr->h_vlan_encapsulated_proto;
-
- rcu_read_lock();
- ptype = gro_find_receive_by_type(type);
- if (!ptype)
- goto out_unlock;
-
- flush = 0;
-
- list_for_each_entry(p, head, list) {
- struct vlan_hdr *vhdr2;
-
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
- if (compare_vlan_header(vhdr, vhdr2))
- NAPI_GRO_CB(p)->same_flow = 0;
- }
-
- skb_gro_pull(skb, sizeof(*vhdr));
- skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
- pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
-
-out_unlock:
- rcu_read_unlock();
-out:
- skb_gro_flush_final(skb, pp, flush);
-
- return pp;
-}
-
-static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
-{
- struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
- __be16 type = vhdr->h_vlan_encapsulated_proto;
- struct packet_offload *ptype;
- int err = -ENOENT;
-
- rcu_read_lock();
- ptype = gro_find_complete_by_type(type);
- if (ptype)
- err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
-
- rcu_read_unlock();
- return err;
-}
-
-static struct packet_offload vlan_packet_offloads[] __read_mostly = {
- {
- .type = cpu_to_be16(ETH_P_8021Q),
- .priority = 10,
- .callbacks = {
- .gro_receive = vlan_gro_receive,
- .gro_complete = vlan_gro_complete,
- },
- },
- {
- .type = cpu_to_be16(ETH_P_8021AD),
- .priority = 10,
- .callbacks = {
- .gro_receive = vlan_gro_receive,
- .gro_complete = vlan_gro_complete,
- },
- },
-};
-
static int __net_init vlan_init_net(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
@@ -761,7 +677,6 @@ static struct pernet_operations vlan_net_ops = {
static int __init vlan_proto_init(void)
{
int err;
- unsigned int i;
pr_info("%s v%s\n", vlan_fullname, vlan_version);
@@ -785,9 +700,6 @@ static int __init vlan_proto_init(void)
if (err < 0)
goto err5;
- for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
- dev_add_offload(&vlan_packet_offloads[i]);
-
vlan_ioctl_set(vlan_ioctl_handler);
return 0;
@@ -805,13 +717,8 @@ err0:
static void __exit vlan_cleanup_module(void)
{
- unsigned int i;
-
vlan_ioctl_set(NULL);
- for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
- dev_remove_offload(&vlan_packet_offloads[i]);
-
vlan_netlink_fini();
unregister_netdevice_notifier(&vlan_notifier_block);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 44df1c3df02d..c46daf09a501 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -92,6 +92,18 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
return NULL;
}
+static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
+{
+ netdev_features_t ret;
+
+ ret = real_dev->hw_enc_features &
+ (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL);
+
+ if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK))
+ return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
+ return 0;
+}
+
#define vlan_group_for_each_dev(grp, i, dev) \
for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \
if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 4f60e86f4b8d..a313165e7a67 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -57,7 +57,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
}
skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
@@ -223,6 +223,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi
return -ENODEV;
}
+int vlan_for_each(struct net_device *dev,
+ int (*action)(struct net_device *dev, int vid, void *arg),
+ void *arg)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ struct net_device *vdev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto,
+ vid_info->vid);
+ ret = action(vdev, vid_info->vid, arg);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(vlan_for_each);
+
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
struct net_device *real_dev = vlan_info->real_dev;
@@ -426,3 +453,102 @@ bool vlan_uses_dev(const struct net_device *dev)
return vlan_info->grp.nr_vlan_devs ? true : false;
}
EXPORT_SYMBOL(vlan_uses_dev);
+
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
+ __be16 type;
+ int flush = 1;
+
+ off_vlan = skb_gro_offset(skb);
+ hlen = off_vlan + sizeof(*vhdr);
+ vhdr = skb_gro_header_fast(skb, off_vlan);
+ if (skb_gro_header_hard(skb, hlen)) {
+ vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
+ if (unlikely(!vhdr))
+ goto out;
+ }
+
+ type = vhdr->h_vlan_encapsulated_proto;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out_unlock;
+
+ flush = 0;
+
+ list_for_each_entry(p, head, list) {
+ struct vlan_hdr *vhdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+ if (compare_vlan_header(vhdr, vhdr2))
+ NAPI_GRO_CB(p)->same_flow = 0;
+ }
+
+ skb_gro_pull(skb, sizeof(*vhdr));
+ skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+ __be16 type = vhdr->h_vlan_encapsulated_proto;
+ struct packet_offload *ptype;
+ int err = -ENOENT;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
+
+ rcu_read_unlock();
+ return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+ {
+ .type = cpu_to_be16(ETH_P_8021Q),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+ {
+ .type = cpu_to_be16(ETH_P_8021AD),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+};
+
+static int __init vlan_offload_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+ dev_add_offload(&vlan_packet_offloads[i]);
+
+ return 0;
+}
+
+fs_initcall(vlan_offload_init);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index ff720f1ebf73..b2d9c8f27cd7 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -562,6 +562,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL |
NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
NETIF_F_ALL_FCOE;
@@ -572,6 +573,7 @@ static int vlan_dev_init(struct net_device *dev)
netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
+ dev->hw_enc_features = vlan_tnl_features(real_dev);
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
diff --git a/net/Kconfig b/net/Kconfig
index f235edb593ba..5cb9de1aaf88 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -51,6 +51,9 @@ config NET_INGRESS
config NET_EGRESS
bool
+config SKB_EXTENSIONS
+ bool
+
menu "Networking options"
source "net/packet/Kconfig"
@@ -184,6 +187,7 @@ config BRIDGE_NETFILTER
depends on NETFILTER && INET
depends on NETFILTER_ADVANCED
select NETFILTER_FAMILY_BRIDGE
+ select SKB_EXTENSIONS
default m
---help---
Enabling this option will let arptables resp. iptables see bridged
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index f75816f58107..c386e6981416 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -22,7 +22,6 @@
config BATMAN_ADV
tristate "B.A.T.M.A.N. Advanced Meshing Protocol"
depends on NET
- select CRC16
select LIBCRC32C
help
B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
@@ -48,6 +47,7 @@ config BATMAN_ADV_BATMAN_V
config BATMAN_ADV_BLA
bool "Bridge Loop Avoidance"
depends on BATMAN_ADV && INET
+ select CRC16
default y
help
This option enables BLA (Bridge Loop Avoidance), a mechanism
@@ -82,6 +82,7 @@ config BATMAN_ADV_NC
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
+ default y
help
This option enables the multicast optimisation which aims to
reduce the air overhead while improving the reliability of
@@ -100,12 +101,13 @@ config BATMAN_ADV_DEBUGFS
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
- depends on BATMAN_ADV_DEBUGFS
+ depends on BATMAN_ADV
help
This is an option for use by developers; most people should
say N here. This enables compilation of support for
- outputting debugging information to the kernel log. The
- output is controlled via the module parameter debug.
+ outputting debugging information to the debugfs log or tracing
+ buffer. The output is controlled via the batadv netdev specific
+ log_level setting.
config BATMAN_ADV_TRACING
bool "B.A.T.M.A.N. tracing support"
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index d2227091029f..f97e566f0402 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -34,7 +34,6 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
-#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/pkt_sched.h>
@@ -2585,13 +2584,14 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
* batadv_iv_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @bat_priv: The bat priv with all the soft interface information
* @gw_node: Gateway to be dumped
*
* Return: Error code, or 0 on success
*/
-static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_gw_node *gw_node)
{
@@ -2611,13 +2611,16 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
ret = -EMSGSIZE;
if (curr_gw == gw_node)
@@ -2668,13 +2671,15 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx_skip = cb->args[0];
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
- if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, gw_node)) {
+ if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
idx_skip = idx - 1;
goto unlock;
}
@@ -2682,7 +2687,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
idx_skip = idx;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&bat_priv->gw.list_lock);
cb->args[0] = idx_skip;
}
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 6baec4e68898..90e33f84d37a 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -27,11 +27,13 @@
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -915,13 +917,14 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv,
* batadv_v_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @bat_priv: The bat priv with all the soft interface information
* @gw_node: Gateway to be dumped
*
* Return: Error code, or 0 on success
*/
-static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_gw_node *gw_node)
{
@@ -941,13 +944,16 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
ret = -EMSGSIZE;
if (curr_gw == gw_node) {
@@ -1018,13 +1024,15 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx_skip = cb->args[0];
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
- if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, gw_node)) {
+ if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
idx_skip = idx - 1;
goto unlock;
}
@@ -1032,7 +1040,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
idx_skip = idx;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&bat_priv->gw.list_lock);
cb->args[0] = idx_skip;
}
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 5f1aeeded0e3..5fdde2947802 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -2094,14 +2094,15 @@ out:
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
* @claim: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
struct batadv_bla_claim *claim)
{
@@ -2111,13 +2112,16 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
void *hdr;
int ret = -EINVAL;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_CLAIM);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
is_own = batadv_compare_eth(claim->backbone_gw->orig,
primary_addr);
@@ -2153,28 +2157,33 @@ out:
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
- * @head: bucket to dump
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: always 0.
*/
static int
-batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
- struct hlist_head *head, int *idx_skip)
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
{
struct batadv_bla_claim *claim;
int idx = 0;
int ret = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_skip)
continue;
- ret = batadv_bla_claim_dump_entry(msg, portid, seq,
+ ret = batadv_bla_claim_dump_entry(msg, portid, cb,
primary_if, claim);
if (ret) {
*idx_skip = idx - 1;
@@ -2184,7 +2193,7 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*idx_skip = 0;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return ret;
}
@@ -2204,7 +2213,6 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
int ifindex;
int ret = 0;
@@ -2230,11 +2238,8 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_bla_claim_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq,
- primary_if, head, &idx))
+ if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
break;
bucket++;
}
@@ -2325,14 +2330,15 @@ out:
* netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
* @backbone_gw: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
struct batadv_bla_backbone_gw *backbone_gw)
{
@@ -2343,13 +2349,16 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
void *hdr;
int ret = -EINVAL;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_BACKBONE);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
spin_lock_bh(&backbone_gw->crc_lock);
@@ -2386,28 +2395,33 @@ out:
* a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
- * @head: bucket to dump
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: always 0.
*/
static int
-batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
- struct hlist_head *head, int *idx_skip)
+ struct batadv_hashtable *hash,
+ unsigned int bucket, int *idx_skip)
{
struct batadv_bla_backbone_gw *backbone_gw;
int idx = 0;
int ret = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_skip)
continue;
- ret = batadv_bla_backbone_dump_entry(msg, portid, seq,
+ ret = batadv_bla_backbone_dump_entry(msg, portid, cb,
primary_if, backbone_gw);
if (ret) {
*idx_skip = idx - 1;
@@ -2417,7 +2431,7 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*idx_skip = 0;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return ret;
}
@@ -2437,7 +2451,6 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
int ifindex;
int ret = 0;
@@ -2463,11 +2476,8 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_bla_backbone_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq,
- primary_if, head, &idx))
+ if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
break;
bucket++;
}
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 8b608a2e2653..d4a7702e48d8 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -19,6 +19,7 @@
#include "debugfs.h"
#include "main.h"
+#include <asm/current.h>
#include <linux/dcache.h>
#include <linux/debugfs.h>
#include <linux/err.h>
@@ -27,6 +28,7 @@
#include <linux/fs.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
+#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/stddef.h>
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index a60bacf7120b..b9ffe1826527 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -863,23 +863,27 @@ out:
* netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @dat_entry: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_dat_entry *dat_entry)
{
int msecs;
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_DAT_CACHE);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
msecs = jiffies_to_msecs(jiffies - dat_entry->last_update);
if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS,
@@ -901,27 +905,31 @@ batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
- * @head: bucket to dump
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: 0 or error code.
*/
static int
-batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
- struct hlist_head *head, int *idx_skip)
+batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
{
struct batadv_dat_entry *dat_entry;
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) {
if (idx < *idx_skip)
goto skip;
- if (batadv_dat_cache_dump_entry(msg, portid, seq,
- dat_entry)) {
- rcu_read_unlock();
+ if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_skip = idx;
return -EMSGSIZE;
@@ -930,7 +938,7 @@ batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
skip:
idx++;
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return 0;
}
@@ -951,7 +959,6 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
int ifindex;
int ret = 0;
@@ -977,10 +984,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_dat_cache_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq, head,
+ if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket,
&idx))
break;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 140c61a3f1ec..9d8e5eda2314 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -377,6 +377,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
kref_get(&gw_node->refcount);
hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
+ bat_priv->gw.generation++;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n",
@@ -472,6 +473,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
if (!hlist_unhashed(&gw_node->list)) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
}
spin_unlock_bh(&bat_priv->gw.list_lock);
@@ -518,6 +520,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
&bat_priv->gw.gateway_list, list) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
}
spin_unlock_bh(&bat_priv->gw.list_lock);
}
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 781c5b6e6e8e..508f4416dfc9 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -951,6 +951,7 @@ batadv_hardif_add_interface(struct net_device *net_dev)
batadv_check_known_mac_addr(hard_iface->net_dev);
kref_get(&hard_iface->refcount);
list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
+ batadv_hardif_generation++;
return hard_iface;
@@ -993,6 +994,7 @@ void batadv_hardif_remove_interfaces(void)
list_for_each_entry_safe(hard_iface, hard_iface_tmp,
&batadv_hardif_list, list) {
list_del_rcu(&hard_iface->list);
+ batadv_hardif_generation++;
batadv_hardif_remove_interface(hard_iface);
}
rtnl_unlock();
@@ -1054,6 +1056,7 @@ static int batadv_hard_if_event(struct notifier_block *this,
case NETDEV_UNREGISTER:
case NETDEV_PRE_TYPE_CHANGE:
list_del_rcu(&hard_iface->list);
+ batadv_hardif_generation++;
batadv_hardif_remove_interface(hard_iface);
break;
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 7b49e4001778..9194f4d891b1 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -32,6 +32,8 @@ static void batadv_hash_init(struct batadv_hashtable *hash)
INIT_HLIST_HEAD(&hash->table[i]);
spin_lock_init(&hash->list_locks[i]);
}
+
+ atomic_set(&hash->generation, 0);
}
/**
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 9490a7ca2ba6..0e36fa1c7c3e 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -21,6 +21,7 @@
#include "main.h"
+#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/rculist.h>
@@ -58,6 +59,9 @@ struct batadv_hashtable {
/** @size: size of hashtable */
u32 size;
+
+ /** @generation: current (generation) sequence number */
+ atomic_t generation;
};
/* allocates and clears the hash */
@@ -112,6 +116,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash,
/* no duplicate found in list, add new element */
hlist_add_head_rcu(data_node, head);
+ atomic_inc(&hash->generation);
ret = 0;
@@ -154,6 +159,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
data_save = node;
hlist_del_rcu(node);
+ atomic_inc(&hash->generation);
break;
}
spin_unlock_bh(&hash->list_locks[index]);
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 6beb5f067810..02e55b78132f 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -43,6 +43,8 @@
#include "debugfs.h"
#include "trace.h"
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
@@ -92,33 +94,6 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
return 0;
}
-/**
- * batadv_debug_log() - Add debug log entry
- * @bat_priv: the bat priv with all the soft interface information
- * @fmt: format string
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV",
- jiffies_to_msecs(jiffies), &vaf);
-
- trace_batadv_dbg(bat_priv, &vaf);
-
- va_end(args);
-
- return 0;
-}
-
static int batadv_log_open(struct inode *inode, struct file *file)
{
if (!try_module_get(THIS_MODULE))
@@ -259,3 +234,34 @@ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
kfree(bat_priv->debug_log);
bat_priv->debug_log = NULL;
}
+
+#endif /* CONFIG_BATMAN_ADV_DEBUGFS */
+
+/**
+ * batadv_debug_log() - Add debug log entry
+ * @bat_priv: the bat priv with all the soft interface information
+ * @fmt: format string
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+ batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV",
+ jiffies_to_msecs(jiffies), &vaf);
+#endif
+
+ trace_batadv_dbg(bat_priv, &vaf);
+
+ va_end(args);
+
+ return 0;
+}
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 69c0d85bceb3..d1ed839fd32b 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -74,6 +74,7 @@
* list traversals just rcu-locked
*/
struct list_head batadv_hardif_list;
+unsigned int batadv_hardif_generation;
static int (*batadv_rx_handler[256])(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
@@ -186,6 +187,8 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
INIT_HLIST_HEAD(&bat_priv->tp_list);
+ bat_priv->gw.generation = 0;
+
ret = batadv_v_mesh_init(bat_priv);
if (ret < 0)
goto err;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 2002b70e18db..b572066325e4 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.4"
+#define BATADV_SOURCE_VERSION "2019.0"
#endif
/* B.A.T.M.A.N. parameters */
@@ -247,6 +247,7 @@ static inline int batadv_print_vid(unsigned short vid)
}
extern struct list_head batadv_hardif_list;
+extern unsigned int batadv_hardif_generation;
extern unsigned char batadv_broadcast_addr[];
extern struct workqueue_struct *batadv_event_workqueue;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 86725d792e15..69244e4598f5 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1365,22 +1365,26 @@ int batadv_mcast_mesh_info_put(struct sk_buff *msg,
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @orig_node: originator to dump the multicast flags of
*
* Return: 0 or error code.
*/
static int
-batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_orig_node *orig_node)
{
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_MCAST_FLAGS);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
orig_node->orig)) {
genlmsg_cancel(msg, hdr);
@@ -1405,21 +1409,26 @@ batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* table to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
- * @head: bucket to dump
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: 0 or error code.
*/
static int
-batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
- struct hlist_head *head, long *idx_skip)
+batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash,
+ unsigned int bucket, long *idx_skip)
{
struct batadv_orig_node *orig_node;
long idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) {
if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
&orig_node->capa_initialized))
continue;
@@ -1427,9 +1436,8 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
if (idx < *idx_skip)
goto skip;
- if (batadv_mcast_flags_dump_entry(msg, portid, seq,
- orig_node)) {
- rcu_read_unlock();
+ if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_skip = idx;
return -EMSGSIZE;
@@ -1438,7 +1446,7 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
skip:
idx++;
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return 0;
}
@@ -1447,7 +1455,7 @@ skip:
* __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @bat_priv: the bat priv with all the soft interface information
* @bucket: current bucket to dump
* @idx: index in current bucket to the next entry to dump
@@ -1455,19 +1463,17 @@ skip:
* Return: 0 or error code.
*/
static int
-__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq,
+__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv, long *bucket, long *idx)
{
struct batadv_hashtable *hash = bat_priv->orig_hash;
long bucket_tmp = *bucket;
- struct hlist_head *head;
long idx_tmp = *idx;
while (bucket_tmp < hash->size) {
- head = &hash->table[bucket_tmp];
-
- if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head,
- &idx_tmp))
+ if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash,
+ *bucket, &idx_tmp))
break;
bucket_tmp++;
@@ -1550,8 +1556,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb)
return ret;
bat_priv = netdev_priv(primary_if->soft_iface);
- ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, bucket, idx);
+ ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx);
batadv_hardif_put(primary_if);
return ret;
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 0d9459b69bdb..2dc3304cee54 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -29,11 +29,11 @@
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/printk.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
@@ -445,23 +445,27 @@ out:
* batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @hard_iface: Hard interface to dump
*
* Return: error code, or 0 on success
*/
static int
-batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *hard_iface)
{
struct net_device *net_dev = hard_iface->net_dev;
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
BATADV_CMD_GET_HARDIFS);
if (!hdr)
return -EMSGSIZE;
+ genl_dump_check_consistent(cb, hdr);
+
if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
net_dev->ifindex) ||
nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
@@ -498,7 +502,6 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
struct batadv_hard_iface *hard_iface;
int ifindex;
int portid = NETLINK_CB(cb->skb).portid;
- int seq = cb->nlh->nlmsg_seq;
int skip = cb->args[0];
int i = 0;
@@ -516,23 +519,24 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
return -ENODEV;
}
- rcu_read_lock();
+ rtnl_lock();
+ cb->seq = batadv_hardif_generation << 1 | 1;
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->soft_iface != soft_iface)
continue;
if (i++ < skip)
continue;
- if (batadv_netlink_dump_hardif_entry(msg, portid, seq,
+ if (batadv_netlink_dump_hardif_entry(msg, portid, cb,
hard_iface)) {
i--;
break;
}
}
- rcu_read_unlock();
+ rtnl_unlock();
dev_put(soft_iface);
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
index 3d57f9981f25..8e1024217cff 100644
--- a/net/batman-adv/trace.c
+++ b/net/batman-adv/trace.c
@@ -16,7 +16,5 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include <linux/module.h>
-
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
index 3acda26a30ca..104784be94d7 100644
--- a/net/batman-adv/trace.h
+++ b/net/batman-adv/trace.h
@@ -21,7 +21,13 @@
#include "main.h"
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
#include <linux/tracepoint.h>
+#include <linux/types.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM batadv
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index d21624c44665..8dcd4968cde7 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1145,14 +1145,15 @@ out:
* batadv_tt_local_dump_entry() - Dump one TT local entry into a message
* @msg :Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @bat_priv: The bat priv with all the soft interface information
* @common: tt local & tt global common data
*
* Return: Error code, or 0 on success
*/
static int
-batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_tt_common_entry *common)
{
@@ -1173,12 +1174,14 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
batadv_softif_vlan_put(vlan);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI,
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
BATADV_CMD_GET_TRANSTABLE_LOCAL);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
@@ -1201,34 +1204,39 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @bat_priv: The bat priv with all the soft interface information
- * @head: Pointer to the list containing the local tt entries
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_s: Number of entries to skip
*
* Return: Error code, or 0 on success
*/
static int
-batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
- struct hlist_head *head, int *idx_s)
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_s)
{
struct batadv_tt_common_entry *common;
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(common, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(common, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_s)
continue;
- if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv,
+ if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv,
common)) {
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_s = idx - 1;
return -EMSGSIZE;
}
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_s = 0;
return 0;
@@ -1248,7 +1256,6 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_hashtable *hash;
- struct hlist_head *head;
int ret;
int ifindex;
int bucket = cb->args[0];
@@ -1276,10 +1283,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
hash = bat_priv->tt.local_hash;
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, head, &idx))
+ if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv,
+ hash, bucket, &idx))
break;
bucket++;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 45b5592de816..cbe17da36fcb 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1096,12 +1096,15 @@ struct batadv_priv_gw {
/** @gateway_list: list of available gateway nodes */
struct hlist_head gateway_list;
- /** @list_lock: lock protecting gateway_list & curr_gw */
+ /** @list_lock: lock protecting gateway_list, curr_gw, generation */
spinlock_t list_lock;
/** @curr_gw: pointer to currently selected gateway node */
struct batadv_gw_node __rcu *curr_gw;
+ /** @generation: current (generation) sequence number */
+ unsigned int generation;
+
/**
* @mode: gateway operation: off, client or server (see batadv_gw_modes)
*/
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 828e87fe8027..9d79c7de234a 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -607,7 +607,7 @@ static void ifup(struct net_device *netdev)
int err;
rtnl_lock();
- err = dev_open(netdev);
+ err = dev_open(netdev, NULL);
if (err < 0)
BT_INFO("iface %s cannot be opened (%d)", netdev->name, err);
rtnl_unlock();
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ef9928d7b4fb..ac2826ce162b 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5711,6 +5711,12 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
return true;
}
+ /* Check if request ended in Command Status - no way to retreive
+ * any extra parameters in this case.
+ */
+ if (hdr->evt == HCI_EV_CMD_STATUS)
+ return false;
+
if (hdr->evt != HCI_EV_CMD_COMPLETE) {
bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)",
hdr->evt);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e8c9ef1e1922..ca73d36cc149 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1556,7 +1556,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
mgmt_get_connectable(hdev);
- if (!is_advertising_allowed(hdev, connectable))
+ if (!is_advertising_allowed(hdev, connectable))
return -EPERM;
/* Set require_privacy to true only when non-connectable
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 2146e0f3b6f8..2a7fb517d460 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -7650,17 +7650,7 @@ static int l2cap_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int l2cap_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, l2cap_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations l2cap_debugfs_fops = {
- .open = l2cap_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(l2cap_debugfs);
static struct dentry *l2cap_debugfs;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index b98225d65e87..1a635df80643 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -2166,17 +2166,7 @@ static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
return 0;
}
-static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations rfcomm_dlc_debugfs_fops = {
- .open = rfcomm_dlc_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(rfcomm_dlc_debugfs);
static struct dentry *rfcomm_dlc_debugfs;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d606e9212291..aa0db1d1bd9b 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1020,17 +1020,7 @@ static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations rfcomm_sock_debugfs_fops = {
- .open = rfcomm_sock_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(rfcomm_sock_debugfs);
static struct dentry *rfcomm_sock_debugfs;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 8f0f9279eac9..529b38996d8b 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1173,17 +1173,7 @@ static int sco_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int sco_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sco_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations sco_debugfs_fops = {
- .open = sco_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sco_debugfs);
static struct dentry *sco_debugfs;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index c822e626761b..621146d04c03 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -1390,7 +1390,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
if (!smp)
return NULL;
- smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(smp->tfm_aes)) {
BT_ERR("Unable to create AES crypto context");
goto zfree_smp;
@@ -3233,7 +3233,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (!smp)
return ERR_PTR(-ENOMEM);
- tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(tfm_aes)) {
BT_ERR("Unable to create AES crypto context");
kzfree(smp);
@@ -3906,13 +3906,13 @@ int __init bt_selftest_smp(void)
struct crypto_kpp *tfm_ecdh;
int err;
- tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(tfm_aes)) {
BT_ERR("Unable to create AES crypto context");
return PTR_ERR(tfm_aes);
}
- tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
crypto_free_cipher(tfm_aes);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index c89c22c49015..fa2644d276ef 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
return ret;
}
-static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
+static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret,
+ u32 *time)
{
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
enum bpf_cgroup_storage_type stype;
u64 time_start, time_spent = 0;
- u32 ret = 0, i;
+ u32 i;
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
@@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
repeat = 1;
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
- ret = bpf_test_run_one(prog, ctx, storage);
+ *ret = bpf_test_run_one(prog, ctx, storage);
if (need_resched()) {
if (signal_pending(current))
break;
@@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
- return ret;
+ return 0;
}
static int bpf_test_finish(const union bpf_attr *kattr,
@@ -74,8 +75,18 @@ static int bpf_test_finish(const union bpf_attr *kattr,
{
void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
int err = -EFAULT;
+ u32 copy_size = size;
+
+ /* Clamp copy if the user has provided a size hint, but copy the full
+ * buffer if not to retain old behaviour.
+ */
+ if (kattr->test.data_size_out &&
+ copy_size > kattr->test.data_size_out) {
+ copy_size = kattr->test.data_size_out;
+ err = -ENOSPC;
+ }
- if (data_out && copy_to_user(data_out, data, size))
+ if (data_out && copy_to_user(data_out, data, copy_size))
goto out;
if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
goto out;
@@ -83,7 +94,8 @@ static int bpf_test_finish(const union bpf_attr *kattr,
goto out;
if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration)))
goto out;
- err = 0;
+ if (err != -ENOSPC)
+ err = 0;
out:
return err;
}
@@ -165,7 +177,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
__skb_push(skb, hh_len);
if (is_direct_pkt_access)
bpf_compute_data_pointers(skb);
- retval = bpf_test_run(prog, skb, repeat, &duration);
+ ret = bpf_test_run(prog, skb, repeat, &retval, &duration);
+ if (ret) {
+ kfree_skb(skb);
+ kfree(sk);
+ return ret;
+ }
if (!is_l2) {
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
@@ -212,11 +229,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
xdp.rxq = &rxqueue->xdp_rxq;
- retval = bpf_test_run(prog, &xdp, repeat, &duration);
+ ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration);
+ if (ret)
+ goto out;
if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
+out:
kfree(data);
return ret;
}
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 360ad66c21e9..a5174e5001d8 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -31,6 +31,8 @@
*/
static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct netdev_notifier_pre_changeaddr_info *prechaddr_info;
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
@@ -56,6 +58,17 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
br_mtu_auto_adjust(br);
break;
+ case NETDEV_PRE_CHANGEADDR:
+ if (br->dev->addr_assign_type == NET_ADDR_SET)
+ break;
+ prechaddr_info = ptr;
+ err = dev_pre_changeaddr_notify(br->dev,
+ prechaddr_info->dev_addr,
+ extack);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
case NETDEV_CHANGEADDR:
spin_lock_bh(&br->lock);
br_fdb_changeaddr(p, dev->dev_addr);
@@ -175,6 +188,82 @@ static struct notifier_block br_switchdev_notifier = {
.notifier_call = br_switchdev_event,
};
+/* br_boolopt_toggle - change user-controlled boolean option
+ *
+ * @br: bridge device
+ * @opt: id of the option to change
+ * @on: new option value
+ * @extack: extack for error messages
+ *
+ * Changes the value of the respective boolean option to @on taking care of
+ * any internal option value mapping and configuration.
+ */
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack)
+{
+ switch (opt) {
+ case BR_BOOLOPT_NO_LL_LEARN:
+ br_opt_toggle(br, BROPT_NO_LL_LEARN, on);
+ break;
+ default:
+ /* shouldn't be called with unsupported options */
+ WARN_ON(1);
+ break;
+ }
+
+ return 0;
+}
+
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
+{
+ switch (opt) {
+ case BR_BOOLOPT_NO_LL_LEARN:
+ return br_opt_get(br, BROPT_NO_LL_LEARN);
+ default:
+ /* shouldn't be called with unsupported options */
+ WARN_ON(1);
+ break;
+ }
+
+ return 0;
+}
+
+int br_boolopt_multi_toggle(struct net_bridge *br,
+ struct br_boolopt_multi *bm,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long bitmap = bm->optmask;
+ int err = 0;
+ int opt_id;
+
+ for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {
+ bool on = !!(bm->optval & BIT(opt_id));
+
+ err = br_boolopt_toggle(br, opt_id, on, extack);
+ if (err) {
+ br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n",
+ opt_id, br_boolopt_get(br, opt_id), on, err);
+ break;
+ }
+ }
+
+ return err;
+}
+
+void br_boolopt_multi_get(const struct net_bridge *br,
+ struct br_boolopt_multi *bm)
+{
+ u32 optval = 0;
+ int opt_id;
+
+ for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++)
+ optval |= (br_boolopt_get(br, opt_id) << opt_id);
+
+ bm->optval = optval;
+ bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0);
+}
+
+/* private bridge options, controlled by the kernel */
void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
{
bool cur = !!br_opt_get(br, opt);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c6abf927f0c9..013323b6dbe4 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev)
return err;
}
+ err = br_mdb_hash_init(br);
+ if (err) {
+ free_percpu(br->stats);
+ br_fdb_hash_fini(br);
+ return err;
+ }
+
err = br_vlan_init(br);
if (err) {
free_percpu(br->stats);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
return err;
}
@@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev)
if (err) {
free_percpu(br->stats);
br_vlan_flush(br);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
}
br_set_lockdep_class(dev);
@@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev)
br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
free_percpu(br->stats);
}
@@ -393,6 +403,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
+ .ndo_fdb_get = br_fdb_get,
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e56ba3912a90..fe3c758791ca 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -773,6 +773,32 @@ skip:
return err;
}
+int br_fdb_get(struct sk_buff *skb,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_fdb_entry *f;
+ int err = 0;
+
+ rcu_read_lock();
+ f = br_fdb_find_rcu(br, addr, vid);
+ if (!f) {
+ NL_SET_ERR_MSG(extack, "Fdb entry not found");
+ err = -ENOENT;
+ goto errout;
+ }
+
+ err = fdb_fill_info(skb, br, f, portid, seq,
+ RTM_NEWNEIGH, 0);
+errout:
+ rcu_read_unlock();
+ return err;
+}
+
/* Update (create or replace) forwarding database entry */
static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
const u8 *addr, u16 state, u16 flags, u16 vid,
@@ -1164,3 +1190,23 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
spin_unlock_bh(&br->hash_lock);
}
+
+void br_fdb_clear_offload(const struct net_device *dev, u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return;
+
+ spin_lock_bh(&p->br->hash_lock);
+ hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) {
+ if (f->dst == p && f->key.vlan_id == vid)
+ f->offloaded = 0;
+ }
+ spin_unlock_bh(&p->br->hash_lock);
+}
+EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 9b46d2dc4c22..41f0a696a65f 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -650,7 +650,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
- err = nbp_vlan_init(p);
+ if (br->dev->addr_assign_type != NET_ADDR_SET) {
+ /* Ask for permission to use this MAC address now, even if we
+ * don't end up choosing it below.
+ */
+ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
+ if (err)
+ goto err7;
+ }
+
+ err = nbp_vlan_init(p, extack);
if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
goto err7;
@@ -741,3 +750,15 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
if (mask & BR_NEIGH_SUPPRESS)
br_recalculate_neigh_suppress_enabled(br);
}
+
+bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
+{
+ struct net_bridge_port *p;
+
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p)
+ return false;
+
+ return p->flags & flag;
+}
+EXPORT_SYMBOL_GPL(br_port_flag_is_set);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 3ddca11f44c2..5ea7e56119c1 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -188,7 +188,9 @@ static void __br_handle_local_finish(struct sk_buff *skb)
u16 vid = 0;
/* check if vlan is allowed, to avoid spoofing */
- if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
+ if ((p->flags & BR_LEARNING) &&
+ !br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
+ br_should_learn(p, skb, &vid))
br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a7ea2d431714..f69c8d91dc81 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
+ int idx = 0, s_idx = cb->args[1], err = 0;
struct net_bridge *br = netdev_priv(dev);
- struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
- int i, err = 0;
- int idx = 0, s_idx = cb->args[1];
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
- mdb = rcu_dereference(br->mdb);
- if (!mdb)
- return 0;
-
nest = nla_nest_start(skb, MDBA_MDB);
if (nest == NULL)
return -EMSGSIZE;
- for (i = 0; i < mdb->max; i++) {
- struct net_bridge_mdb_entry *mp;
+ hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
struct net_bridge_port *port;
- hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
- if (idx < s_idx)
- goto skip;
+ if (idx < s_idx)
+ goto skip;
- nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
- if (nest2 == NULL) {
- err = -EMSGSIZE;
- goto out;
- }
+ nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+ if (!nest2) {
+ err = -EMSGSIZE;
+ break;
+ }
- for (pp = &mp->ports;
- (p = rcu_dereference(*pp)) != NULL;
- pp = &p->next) {
- struct nlattr *nest_ent;
- struct br_mdb_entry e;
-
- port = p->port;
- if (!port)
- continue;
-
- memset(&e, 0, sizeof(e));
- e.ifindex = port->dev->ifindex;
- e.vid = p->addr.vid;
- __mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
+ for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+
+ port = p->port;
+ if (!port)
+ continue;
+
+ memset(&e, 0, sizeof(e));
+ e.ifindex = port->dev->ifindex;
+ e.vid = p->addr.vid;
+ __mdb_entry_fill_flags(&e, p->flags);
+ if (p->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = p->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
+ if (p->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = p->addr.u.ip6;
#endif
- e.addr.proto = p->addr.proto;
- nest_ent = nla_nest_start(skb,
- MDBA_MDB_ENTRY_INFO);
- if (!nest_ent) {
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- if (nla_put_nohdr(skb, sizeof(e), &e) ||
- nla_put_u32(skb,
- MDBA_MDB_EATTR_TIMER,
- br_timer_value(&p->timer))) {
- nla_nest_cancel(skb, nest_ent);
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- nla_nest_end(skb, nest_ent);
+ e.addr.proto = p->addr.proto;
+ nest_ent = nla_nest_start(skb, MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent) {
+ nla_nest_cancel(skb, nest2);
+ err = -EMSGSIZE;
+ goto out;
}
- nla_nest_end(skb, nest2);
- skip:
- idx++;
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(&p->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ nla_nest_cancel(skb, nest2);
+ err = -EMSGSIZE;
+ goto out;
+ }
+ nla_nest_end(skb, nest_ent);
}
+ nla_nest_end(skb, nest2);
+skip:
+ idx++;
}
out:
@@ -203,8 +193,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
- /* In theory this could be wrapped to 0... */
- cb->seq = net->dev_base_seq + br_mdb_rehash_seq;
+ cb->seq = net->dev_base_seq;
for_each_netdev_rcu(net, dev) {
if (dev->priv_flags & IFF_EBRIDGE) {
@@ -297,7 +286,6 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv)
struct br_mdb_complete_info *data = priv;
struct net_bridge_port_group __rcu **pp;
struct net_bridge_port_group *p;
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port *port = data->port;
struct net_bridge *br = port->br;
@@ -306,8 +294,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv)
goto err;
spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, &data->ip);
+ mp = br_mdb_ip_get(br, &data->ip);
if (!mp)
goto out;
for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
@@ -344,7 +331,7 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
mdb.obj.orig_dev = dev;
switch (type) {
case RTM_NEWMDB:
- switchdev_port_obj_add(lower_dev, &mdb.obj);
+ switchdev_port_obj_add(lower_dev, &mdb.obj, NULL);
break;
case RTM_DELMDB:
switchdev_port_obj_del(lower_dev, &mdb.obj);
@@ -394,7 +381,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
__mdb_entry_to_br_ip(entry, &complete_info->ip);
mdb.obj.complete_priv = complete_info;
mdb.obj.complete = br_mdb_complete;
- if (switchdev_port_obj_add(port_dev, &mdb.obj))
+ if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL))
kfree(complete_info);
}
} else if (p && port_dev && type == RTM_DELMDB) {
@@ -588,14 +575,12 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_mdb_htable *mdb;
unsigned long now = jiffies;
int err;
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, group);
+ mp = br_mdb_ip_get(br, group);
if (!mp) {
- mp = br_multicast_new_group(br, port, group);
+ mp = br_multicast_new_group(br, group);
err = PTR_ERR_OR_ZERO(mp);
if (err)
return err;
@@ -696,7 +681,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
@@ -709,9 +693,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
__mdb_entry_to_br_ip(entry, &ip);
spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
-
- mp = br_mdb_ip_get(mdb, &ip);
+ mp = br_mdb_ip_get(br, &ip);
if (!mp)
goto unlock;
@@ -728,7 +710,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ kfree_rcu(p, rcu);
err = 0;
if (!mp->ports && !mp->host_joined &&
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 6bac0d6b7b94..3aeff0895669 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -37,6 +37,14 @@
#include "br_private.h"
+static const struct rhashtable_params br_mdb_rht_params = {
+ .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode),
+ .key_offset = offsetof(struct net_bridge_mdb_entry, addr),
+ .key_len = sizeof(struct br_ip),
+ .automatic_shrinking = true,
+ .locks_mul = 1,
+};
+
static void br_multicast_start_querier(struct net_bridge *br,
struct bridge_mcast_own_query *query);
static void br_multicast_add_router(struct net_bridge *br,
@@ -54,7 +62,6 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
const struct in6_addr *group,
__u16 vid, const unsigned char *src);
#endif
-unsigned int br_mdb_rehash_seq;
static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
{
@@ -73,89 +80,58 @@ static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
return 0;
}
-static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip,
- __u16 vid)
-{
- return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1);
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb,
- const struct in6_addr *ip,
- __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br,
+ struct br_ip *dst)
{
- return jhash_2words(ipv6_addr_hash(ip), vid,
- mdb->secret) & (mdb->max - 1);
+ return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
}
-#endif
-static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb,
- struct br_ip *ip)
-{
- switch (ip->proto) {
- case htons(ETH_P_IP):
- return __br_ip4_hash(mdb, ip->u.ip4, ip->vid);
-#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid);
-#endif
- }
- return 0;
-}
-
-static struct net_bridge_mdb_entry *__br_mdb_ip_get(
- struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash)
+struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br,
+ struct br_ip *dst)
{
- struct net_bridge_mdb_entry *mp;
-
- hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
- if (br_ip_equal(&mp->addr, dst))
- return mp;
- }
+ struct net_bridge_mdb_entry *ent;
- return NULL;
-}
+ lockdep_assert_held_once(&br->multicast_lock);
-struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb,
- struct br_ip *dst)
-{
- if (!mdb)
- return NULL;
+ rcu_read_lock();
+ ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
+ rcu_read_unlock();
- return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst));
+ return ent;
}
-static struct net_bridge_mdb_entry *br_mdb_ip4_get(
- struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br,
+ __be32 dst, __u16 vid)
{
struct br_ip br_dst;
+ memset(&br_dst, 0, sizeof(br_dst));
br_dst.u.ip4 = dst;
br_dst.proto = htons(ETH_P_IP);
br_dst.vid = vid;
- return br_mdb_ip_get(mdb, &br_dst);
+ return br_mdb_ip_get(br, &br_dst);
}
#if IS_ENABLED(CONFIG_IPV6)
-static struct net_bridge_mdb_entry *br_mdb_ip6_get(
- struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst,
- __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br,
+ const struct in6_addr *dst,
+ __u16 vid)
{
struct br_ip br_dst;
+ memset(&br_dst, 0, sizeof(br_dst));
br_dst.u.ip6 = *dst;
br_dst.proto = htons(ETH_P_IPV6);
br_dst.vid = vid;
- return br_mdb_ip_get(mdb, &br_dst);
+ return br_mdb_ip_get(br, &br_dst);
}
#endif
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct sk_buff *skb, u16 vid)
{
- struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
struct br_ip ip;
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
@@ -164,6 +140,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
if (BR_INPUT_SKB_CB(skb)->igmp)
return NULL;
+ memset(&ip, 0, sizeof(ip));
ip.proto = skb->protocol;
ip.vid = vid;
@@ -180,70 +157,13 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
return NULL;
}
- return br_mdb_ip_get(mdb, &ip);
-}
-
-static void br_mdb_free(struct rcu_head *head)
-{
- struct net_bridge_mdb_htable *mdb =
- container_of(head, struct net_bridge_mdb_htable, rcu);
- struct net_bridge_mdb_htable *old = mdb->old;
-
- mdb->old = NULL;
- kfree(old->mhash);
- kfree(old);
-}
-
-static int br_mdb_copy(struct net_bridge_mdb_htable *new,
- struct net_bridge_mdb_htable *old,
- int elasticity)
-{
- struct net_bridge_mdb_entry *mp;
- int maxlen;
- int len;
- int i;
-
- for (i = 0; i < old->max; i++)
- hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver])
- hlist_add_head(&mp->hlist[new->ver],
- &new->mhash[br_ip_hash(new, &mp->addr)]);
-
- if (!elasticity)
- return 0;
-
- maxlen = 0;
- for (i = 0; i < new->max; i++) {
- len = 0;
- hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver])
- len++;
- if (len > maxlen)
- maxlen = len;
- }
-
- return maxlen > elasticity ? -EINVAL : 0;
-}
-
-void br_multicast_free_pg(struct rcu_head *head)
-{
- struct net_bridge_port_group *p =
- container_of(head, struct net_bridge_port_group, rcu);
-
- kfree(p);
-}
-
-static void br_multicast_free_group(struct rcu_head *head)
-{
- struct net_bridge_mdb_entry *mp =
- container_of(head, struct net_bridge_mdb_entry, rcu);
-
- kfree(mp);
+ return br_mdb_ip_get_rcu(br, &ip);
}
static void br_multicast_group_expired(struct timer_list *t)
{
struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer);
struct net_bridge *br = mp->br;
- struct net_bridge_mdb_htable *mdb;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) || timer_pending(&mp->timer))
@@ -255,12 +175,11 @@ static void br_multicast_group_expired(struct timer_list *t)
if (mp->ports)
goto out;
- mdb = mlock_dereference(br->mdb, br);
-
- hlist_del_rcu(&mp->hlist[mdb->ver]);
- mdb->size--;
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_rcu(&mp->mdb_node);
- call_rcu_bh(&mp->rcu, br_multicast_free_group);
+ kfree_rcu(mp, rcu);
out:
spin_unlock(&br->multicast_lock);
@@ -269,14 +188,11 @@ out:
static void br_multicast_del_pg(struct net_bridge *br,
struct net_bridge_port_group *pg)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- mdb = mlock_dereference(br->mdb, br);
-
- mp = br_mdb_ip_get(mdb, &pg->addr);
+ mp = br_mdb_ip_get(br, &pg->addr);
if (WARN_ON(!mp))
return;
@@ -291,7 +207,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
del_timer(&p->timer);
br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
p->flags);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ kfree_rcu(p, rcu);
if (!mp->ports && !mp->host_joined &&
netif_running(br->dev))
@@ -319,53 +235,6 @@ out:
spin_unlock(&br->multicast_lock);
}
-static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
- int elasticity)
-{
- struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1);
- struct net_bridge_mdb_htable *mdb;
- int err;
-
- mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC);
- if (!mdb)
- return -ENOMEM;
-
- mdb->max = max;
- mdb->old = old;
-
- mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC);
- if (!mdb->mhash) {
- kfree(mdb);
- return -ENOMEM;
- }
-
- mdb->size = old ? old->size : 0;
- mdb->ver = old ? old->ver ^ 1 : 0;
-
- if (!old || elasticity)
- get_random_bytes(&mdb->secret, sizeof(mdb->secret));
- else
- mdb->secret = old->secret;
-
- if (!old)
- goto out;
-
- err = br_mdb_copy(mdb, old, elasticity);
- if (err) {
- kfree(mdb->mhash);
- kfree(mdb);
- return err;
- }
-
- br_mdb_rehash_seq++;
- call_rcu_bh(&mdb->rcu, br_mdb_free);
-
-out:
- rcu_assign_pointer(*mdbp, mdb);
-
- return 0;
-}
-
static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
__be32 group,
u8 *igmp_type)
@@ -589,111 +458,19 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
return NULL;
}
-static struct net_bridge_mdb_entry *br_multicast_get_group(
- struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, int hash)
-{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- unsigned int count = 0;
- unsigned int max;
- int elasticity;
- int err;
-
- mdb = rcu_dereference_protected(br->mdb, 1);
- hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
- count++;
- if (unlikely(br_ip_equal(group, &mp->addr)))
- return mp;
- }
-
- elasticity = 0;
- max = mdb->max;
-
- if (unlikely(count > br->hash_elasticity && count)) {
- if (net_ratelimit())
- br_info(br, "Multicast hash table "
- "chain limit reached: %s\n",
- port ? port->dev->name : br->dev->name);
-
- elasticity = br->hash_elasticity;
- }
-
- if (mdb->size >= max) {
- max *= 2;
- if (unlikely(max > br->hash_max)) {
- br_warn(br, "Multicast hash table maximum of %d "
- "reached, disabling snooping: %s\n",
- br->hash_max,
- port ? port->dev->name : br->dev->name);
- err = -E2BIG;
-disable:
- br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
- goto err;
- }
- }
-
- if (max > mdb->max || elasticity) {
- if (mdb->old) {
- if (net_ratelimit())
- br_info(br, "Multicast hash table "
- "on fire: %s\n",
- port ? port->dev->name : br->dev->name);
- err = -EEXIST;
- goto err;
- }
-
- err = br_mdb_rehash(&br->mdb, max, elasticity);
- if (err) {
- br_warn(br, "Cannot rehash multicast "
- "hash table, disabling snooping: %s, %d, %d\n",
- port ? port->dev->name : br->dev->name,
- mdb->size, err);
- goto disable;
- }
-
- err = -EAGAIN;
- goto err;
- }
-
- return NULL;
-
-err:
- mp = ERR_PTR(err);
- return mp;
-}
-
struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
- struct net_bridge_port *p,
struct br_ip *group)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
- int hash;
int err;
- mdb = rcu_dereference_protected(br->mdb, 1);
- if (!mdb) {
- err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0);
- if (err)
- return ERR_PTR(err);
- goto rehash;
- }
-
- hash = br_ip_hash(mdb, group);
- mp = br_multicast_get_group(br, p, group, hash);
- switch (PTR_ERR(mp)) {
- case 0:
- break;
-
- case -EAGAIN:
-rehash:
- mdb = rcu_dereference_protected(br->mdb, 1);
- hash = br_ip_hash(mdb, group);
- break;
+ mp = br_mdb_ip_get(br, group);
+ if (mp)
+ return mp;
- default:
- goto out;
+ if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) {
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
+ return ERR_PTR(-E2BIG);
}
mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
@@ -703,11 +480,15 @@ rehash:
mp->br = br;
mp->addr = *group;
timer_setup(&mp->timer, br_multicast_group_expired, 0);
+ err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ if (err) {
+ kfree(mp);
+ mp = ERR_PTR(err);
+ } else {
+ hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list);
+ }
- hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]);
- mdb->size++;
-
-out:
return mp;
}
@@ -768,7 +549,7 @@ static int br_multicast_add_group(struct net_bridge *br,
(port && port->state == BR_STATE_DISABLED))
goto out;
- mp = br_multicast_new_group(br, port, group);
+ mp = br_multicast_new_group(br, group);
err = PTR_ERR(mp);
if (IS_ERR(mp))
goto err;
@@ -837,6 +618,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
if (ipv6_addr_is_ll_all_nodes(group))
return 0;
+ memset(&br_group, 0, sizeof(br_group));
br_group.u.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
@@ -1483,7 +1265,7 @@ static void br_ip4_multicast_query(struct net_bridge *br,
goto out;
}
- mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid);
+ mp = br_mdb_ip4_get(br, group, vid);
if (!mp)
goto out;
@@ -1567,7 +1349,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
goto out;
}
- mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid);
+ mp = br_mdb_ip6_get(br, group, vid);
if (!mp)
goto out;
@@ -1601,7 +1383,6 @@ br_multicast_leave_group(struct net_bridge *br,
struct bridge_mcast_own_query *own_query,
const unsigned char *src)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
unsigned long now;
@@ -1612,8 +1393,7 @@ br_multicast_leave_group(struct net_bridge *br,
(port && port->state == BR_STATE_DISABLED))
goto out;
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, group);
+ mp = br_mdb_ip_get(br, group);
if (!mp)
goto out;
@@ -1629,7 +1409,7 @@ br_multicast_leave_group(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ kfree_rcu(p, rcu);
br_mdb_notify(br->dev, port, group, RTM_DELMDB,
p->flags);
@@ -1961,8 +1741,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
void br_multicast_init(struct net_bridge *br)
{
- br->hash_elasticity = 4;
- br->hash_max = 512;
+ br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
br->multicast_last_member_count = 2;
@@ -1999,6 +1778,7 @@ void br_multicast_init(struct net_bridge *br)
timer_setup(&br->ip6_own_query.timer,
br_ip6_multicast_query_expired, 0);
#endif
+ INIT_HLIST_HEAD(&br->mdb_list);
}
static void __br_multicast_open(struct net_bridge *br,
@@ -2033,40 +1813,20 @@ void br_multicast_stop(struct net_bridge *br)
void br_multicast_dev_del(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
- struct hlist_node *n;
- u32 ver;
- int i;
+ struct hlist_node *tmp;
spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
- if (!mdb)
- goto out;
-
- br->mdb = NULL;
-
- ver = mdb->ver;
- for (i = 0; i < mdb->max; i++) {
- hlist_for_each_entry_safe(mp, n, &mdb->mhash[i],
- hlist[ver]) {
- del_timer(&mp->timer);
- call_rcu_bh(&mp->rcu, br_multicast_free_group);
- }
- }
-
- if (mdb->old) {
- spin_unlock_bh(&br->multicast_lock);
- rcu_barrier_bh();
- spin_lock_bh(&br->multicast_lock);
- WARN_ON(mdb->old);
+ hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) {
+ del_timer(&mp->timer);
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_rcu(&mp->mdb_node);
+ kfree_rcu(mp, rcu);
}
-
- mdb->old = mdb;
- call_rcu_bh(&mdb->rcu, br_mdb_free);
-
-out:
spin_unlock_bh(&br->multicast_lock);
+
+ rcu_barrier();
}
int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2176,9 +1936,7 @@ static void br_multicast_start_querier(struct net_bridge *br,
int br_multicast_toggle(struct net_bridge *br, unsigned long val)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_port *port;
- int err = 0;
spin_lock_bh(&br->multicast_lock);
if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
@@ -2192,21 +1950,6 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
if (!netif_running(br->dev))
goto unlock;
- mdb = mlock_dereference(br->mdb, br);
- if (mdb) {
- if (mdb->old) {
- err = -EEXIST;
-rollback:
- br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
- goto unlock;
- }
-
- err = br_mdb_rehash(&br->mdb, mdb->max,
- br->hash_elasticity);
- if (err)
- goto rollback;
- }
-
br_multicast_open(br);
list_for_each_entry(port, &br->port_list, list)
__br_multicast_enable_port(port);
@@ -2214,7 +1957,7 @@ rollback:
unlock:
spin_unlock_bh(&br->multicast_lock);
- return err;
+ return 0;
}
bool br_multicast_enabled(const struct net_device *dev)
@@ -2271,45 +2014,6 @@ unlock:
return 0;
}
-int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val)
-{
- int err = -EINVAL;
- u32 old;
- struct net_bridge_mdb_htable *mdb;
-
- spin_lock_bh(&br->multicast_lock);
- if (!is_power_of_2(val))
- goto unlock;
-
- mdb = mlock_dereference(br->mdb, br);
- if (mdb && val < mdb->size)
- goto unlock;
-
- err = 0;
-
- old = br->hash_max;
- br->hash_max = val;
-
- if (mdb) {
- if (mdb->old) {
- err = -EEXIST;
-rollback:
- br->hash_max = old;
- goto unlock;
- }
-
- err = br_mdb_rehash(&br->mdb, br->hash_max,
- br->hash_elasticity);
- if (err)
- goto rollback;
- }
-
-unlock:
- spin_unlock_bh(&br->multicast_lock);
-
- return err;
-}
-
int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
{
/* Currently we support only version 2 and 3 */
@@ -2646,3 +2350,13 @@ void br_multicast_get_stats(const struct net_bridge *br,
}
memcpy(dest, &tdst, sizeof(*dest));
}
+
+int br_mdb_hash_init(struct net_bridge *br)
+{
+ return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
+}
+
+void br_mdb_hash_fini(struct net_bridge *br)
+{
+ rhashtable_destroy(&br->mdb_hash_tbl);
+}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index b1b5e8516724..d21a23698410 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -132,10 +132,7 @@ static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
static void nf_bridge_info_free(struct sk_buff *skb)
{
- if (skb->nf_bridge) {
- nf_bridge_put(skb->nf_bridge);
- skb->nf_bridge = NULL;
- }
+ skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
}
static inline struct net_device *bridge_parent(const struct net_device *dev)
@@ -148,19 +145,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev)
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-
- if (refcount_read(&nf_bridge->use) > 1) {
- struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
-
- if (tmp) {
- memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
- refcount_set(&tmp->use, 1);
- }
- nf_bridge_put(nf_bridge);
- nf_bridge = tmp;
- }
- return nf_bridge;
+ return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}
unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
@@ -247,7 +232,9 @@ drop:
void nf_bridge_update_protocol(struct sk_buff *skb)
{
- switch (skb->nf_bridge->orig_proto) {
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ switch (nf_bridge->orig_proto) {
case BRNF_PROTO_8021Q:
skb->protocol = htons(ETH_P_8021Q);
break;
@@ -506,7 +493,6 @@ static unsigned int br_nf_pre_routing(void *priv,
if (br_validate_ipv4(state->net, skb))
return NF_DROP;
- nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
return NF_DROP;
if (!setup_pre_routing(skb))
@@ -569,7 +555,8 @@ static unsigned int br_nf_forward_ip(void *priv,
struct net_device *parent;
u_int8_t pf;
- if (!skb->nf_bridge)
+ nf_bridge = nf_bridge_info_get(skb);
+ if (!nf_bridge)
return NF_ACCEPT;
/* Need exclusive nf_bridge_info since we might have multiple
@@ -671,10 +658,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff
return 0;
}
- if (data->vlan_tci) {
- skb->vlan_tci = data->vlan_tci;
- skb->vlan_proto = data->vlan_proto;
- }
+ if (data->vlan_proto)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
__skb_push(skb, data->encap_size);
@@ -703,7 +688,9 @@ br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
{
- if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
return PPPOE_SES_HLEN;
return 0;
}
@@ -740,8 +727,13 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
data = this_cpu_ptr(&brnf_frag_data_storage);
- data->vlan_tci = skb->vlan_tci;
- data->vlan_proto = skb->vlan_proto;
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_proto = 0;
+ }
+
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
@@ -836,7 +828,9 @@ static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge && !nf_bridge->in_prerouting &&
!netif_is_l3_master(skb->dev)) {
state->okfn(state->net, state->sk, skb);
return NF_STOLEN;
@@ -874,7 +868,9 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
static int br_nf_dev_xmit(struct sk_buff *skb)
{
- if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge && nf_bridge->bridged_dnat) {
br_nf_pre_routing_finish_bridge_slow(skb);
return 1;
}
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 96c072e71ea2..94039f588f1d 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -224,8 +224,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
if (br_validate_ipv6(state->net, skb))
return NF_DROP;
- nf_bridge_put(skb->nf_bridge);
- if (!nf_bridge_alloc(skb))
+ nf_bridge = nf_bridge_alloc(skb);
+ if (!nf_bridge)
return NF_DROP;
if (!setup_pre_routing(skb))
return NF_DROP;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3345f1984542..9c07591b0232 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -525,7 +525,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
}
static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
- int cmd, struct bridge_vlan_info *vinfo, bool *changed)
+ int cmd, struct bridge_vlan_info *vinfo, bool *changed,
+ struct netlink_ext_ack *extack)
{
bool curr_change;
int err = 0;
@@ -537,11 +538,11 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
* per-VLAN entry as well
*/
err = nbp_vlan_add(p, vinfo->vid, vinfo->flags,
- &curr_change);
+ &curr_change, extack);
} else {
vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY;
err = br_vlan_add(br, vinfo->vid, vinfo->flags,
- &curr_change);
+ &curr_change, extack);
}
if (curr_change)
*changed = true;
@@ -568,7 +569,8 @@ static int br_process_vlan_info(struct net_bridge *br,
struct net_bridge_port *p, int cmd,
struct bridge_vlan_info *vinfo_curr,
struct bridge_vlan_info **vinfo_last,
- bool *changed)
+ bool *changed,
+ struct netlink_ext_ack *extack)
{
if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
return -EINVAL;
@@ -598,7 +600,8 @@ static int br_process_vlan_info(struct net_bridge *br,
sizeof(struct bridge_vlan_info));
for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
tmp_vinfo.vid = v;
- err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed);
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed,
+ extack);
if (err)
break;
}
@@ -607,13 +610,14 @@ static int br_process_vlan_info(struct net_bridge *br,
return err;
}
- return br_vlan_info(br, p, cmd, vinfo_curr, changed);
+ return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
}
static int br_afspec(struct net_bridge *br,
struct net_bridge_port *p,
struct nlattr *af_spec,
- int cmd, bool *changed)
+ int cmd, bool *changed,
+ struct netlink_ext_ack *extack)
{
struct bridge_vlan_info *vinfo_curr = NULL;
struct bridge_vlan_info *vinfo_last = NULL;
@@ -643,7 +647,8 @@ static int br_afspec(struct net_bridge *br,
return -EINVAL;
vinfo_curr = nla_data(attr);
err = br_process_vlan_info(br, p, cmd, vinfo_curr,
- &vinfo_last, changed);
+ &vinfo_last, changed,
+ extack);
if (err)
return err;
break;
@@ -850,7 +855,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
}
/* Change state and parameters on port. */
-int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags,
+ struct netlink_ext_ack *extack)
{
struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
struct nlattr *tb[IFLA_BRPORT_MAX + 1];
@@ -897,7 +903,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
}
if (afspec)
- err = br_afspec(br, p, afspec, RTM_SETLINK, &changed);
+ err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack);
if (changed)
br_ifinfo_notify(RTM_NEWLINK, br, p);
@@ -923,7 +929,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
if (!p && !(dev->priv_flags & IFF_EBRIDGE))
return -EINVAL;
- err = br_afspec(br, p, afspec, RTM_DELLINK, &changed);
+ err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL);
if (changed)
/* Send RTM_NEWLINK because userspace
* expects RTM_NEWLINK for vlan dels
@@ -1035,6 +1041,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
+ [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct br_boolopt_multi) },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1103,7 +1111,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
- err = __br_vlan_set_default_pvid(br, defpvid);
+ err = __br_vlan_set_default_pvid(br, defpvid, extack);
if (err)
return err;
}
@@ -1167,9 +1175,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_MCAST_SNOOPING]) {
u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]);
- err = br_multicast_toggle(br, mcast_snooping);
- if (err)
- return err;
+ br_multicast_toggle(br, mcast_snooping);
}
if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) {
@@ -1187,19 +1193,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
return err;
}
- if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) {
- u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]);
-
- br->hash_elasticity = val;
- }
+ if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
+ br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+ RHT_ELASTICITY);
- if (data[IFLA_BR_MCAST_HASH_MAX]) {
- u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
-
- err = br_multicast_set_hash_max(br, hash_max);
- if (err)
- return err;
- }
+ if (data[IFLA_BR_MCAST_HASH_MAX])
+ br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) {
u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]);
@@ -1296,6 +1295,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
}
#endif
+ if (data[IFLA_BR_MULTI_BOOLOPT]) {
+ struct br_boolopt_multi *bm;
+
+ bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]);
+ err = br_boolopt_multi_toggle(br, bm, extack);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -1374,6 +1382,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */
#endif
+ nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
0;
}
@@ -1387,6 +1396,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
u32 stp_enabled = br->stp_enabled;
u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
u8 vlan_enabled = br_vlan_enabled(br->dev);
+ struct br_boolopt_multi bm;
u64 clockval;
clockval = br_timer_value(&br->hello_timer);
@@ -1403,6 +1413,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
+ br_boolopt_multi_get(br, &bm);
if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
@@ -1420,7 +1431,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
br->topology_change_detected) ||
- nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr))
+ nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
+ nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm))
return -EMSGSIZE;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
@@ -1442,8 +1454,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
- nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
- br->hash_elasticity) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 04c19a37e500..d240b3e7919f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -31,6 +31,8 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
+#define BR_MULTICAST_DEFAULT_HASH_MAX 4096
+
#define BR_VERSION "2.3"
/* Control of forwarding link local multicast */
@@ -213,23 +215,14 @@ struct net_bridge_port_group {
};
struct net_bridge_mdb_entry {
- struct hlist_node hlist[2];
+ struct rhash_head rhnode;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
struct rcu_head rcu;
struct timer_list timer;
struct br_ip addr;
bool host_joined;
-};
-
-struct net_bridge_mdb_htable {
- struct hlist_head *mhash;
- struct rcu_head rcu;
- struct net_bridge_mdb_htable *old;
- u32 size;
- u32 max;
- u32 secret;
- u32 ver;
+ struct hlist_node mdb_node;
};
struct net_bridge_port {
@@ -328,6 +321,7 @@ enum net_bridge_opts {
BROPT_NEIGH_SUPPRESS_ENABLED,
BROPT_MTU_SET_BY_USER,
BROPT_VLAN_STATS_PER_PORT,
+ BROPT_NO_LL_LEARN,
};
struct net_bridge {
@@ -380,7 +374,6 @@ struct net_bridge {
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- u32 hash_elasticity;
u32 hash_max;
u32 multicast_last_member_count;
@@ -399,7 +392,9 @@ struct net_bridge {
unsigned long multicast_query_response_interval;
unsigned long multicast_startup_query_interval;
- struct net_bridge_mdb_htable __rcu *mdb;
+ struct rhashtable mdb_hash_tbl;
+
+ struct hlist_head mdb_list;
struct hlist_head router_list;
struct timer_list multicast_router_timer;
@@ -507,6 +502,14 @@ static inline int br_opt_get(const struct net_bridge *br,
return test_bit(opt, &br->options);
}
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack);
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt);
+int br_boolopt_multi_toggle(struct net_bridge *br,
+ struct br_boolopt_multi *bm,
+ struct netlink_ext_ack *extack);
+void br_boolopt_multi_get(const struct net_bridge *br,
+ struct br_boolopt_multi *bm);
void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
/* br_device.c */
@@ -572,6 +575,9 @@ int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
const unsigned char *addr, u16 vid, u16 nlh_flags);
int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev, struct net_device *fdev, int *idx);
+int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
+ const unsigned char *addr, u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack);
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
@@ -650,7 +656,6 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
/* br_multicast.c */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
-extern unsigned int br_mdb_rehash_seq;
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
@@ -675,17 +680,15 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
#endif
struct net_bridge_mdb_entry *
-br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
+br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
struct net_bridge_mdb_entry *
-br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group);
-void br_multicast_free_pg(struct rcu_head *head);
+br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
unsigned char flags, const unsigned char *src);
-void br_mdb_init(void);
-void br_mdb_uninit(void);
+int br_mdb_hash_init(struct net_bridge *br);
+void br_mdb_hash_fini(struct net_bridge *br);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
struct br_ip *group, int type, u8 flags);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -697,6 +700,8 @@ void br_multicast_uninit_stats(struct net_bridge *br);
void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
+void br_mdb_init(void);
+void br_mdb_uninit(void);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -822,6 +827,15 @@ static inline void br_mdb_uninit(void)
{
}
+static inline int br_mdb_hash_init(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline void br_mdb_hash_fini(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_count(struct net_bridge *br,
const struct net_bridge_port *p,
const struct sk_buff *skb,
@@ -857,7 +871,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb);
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
- bool *changed);
+ bool *changed, struct netlink_ext_ack *extack);
int br_vlan_delete(struct net_bridge *br, u16 vid);
void br_vlan_flush(struct net_bridge *br);
struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid);
@@ -870,12 +884,13 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
-int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack);
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed);
+ bool *changed, struct netlink_ext_ack *extack);
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
void nbp_vlan_flush(struct net_bridge_port *port);
-int nbp_vlan_init(struct net_bridge_port *port);
+int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
void br_vlan_get_stats(const struct net_bridge_vlan *v,
struct br_vlan_stats *stats);
@@ -912,7 +927,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
int err = 0;
if (skb_vlan_tag_present(skb)) {
- *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
+ *vid = skb_vlan_tag_get_id(skb);
} else {
*vid = 0;
err = -EINVAL;
@@ -960,7 +975,7 @@ static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
}
static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
*changed = false;
return -EOPNOTSUPP;
@@ -985,7 +1000,7 @@ static inline int br_vlan_init(struct net_bridge *br)
}
static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
*changed = false;
return -EOPNOTSUPP;
@@ -1006,7 +1021,8 @@ static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group
return NULL;
}
-static inline int nbp_vlan_init(struct net_bridge_port *port)
+static inline int nbp_vlan_init(struct net_bridge_port *port,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -1127,7 +1143,8 @@ int br_netlink_init(void);
void br_netlink_fini(void);
void br_ifinfo_notify(int event, const struct net_bridge *br,
const struct net_bridge_port *port);
-int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
+ struct netlink_ext_ack *extack);
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
u32 filter_mask, int nlflags);
@@ -1162,7 +1179,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long mask);
void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
int type);
-int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ struct netlink_ext_ack *extack);
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
@@ -1194,7 +1212,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
}
static inline int br_switchdev_port_vlan_add(struct net_device *dev,
- u16 vid, u16 flags)
+ u16 vid, u16 flags,
+ struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
}
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index b993df770675..035ff59d9cbd 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -140,7 +140,8 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
}
}
-int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ struct netlink_ext_ack *extack)
{
struct switchdev_obj_port_vlan v = {
.obj.orig_dev = dev,
@@ -150,7 +151,7 @@ int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
.vid_end = vid,
};
- return switchdev_port_obj_add(dev, &v.obj);
+ return switchdev_port_obj_add(dev, &v.obj, extack);
}
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 60182bef6341..b05b94e9c595 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -328,6 +328,27 @@ static ssize_t flush_store(struct device *d,
}
static DEVICE_ATTR_WO(flush);
+static ssize_t no_linklocal_learn_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
+}
+
+static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val)
+{
+ return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL);
+}
+
+static ssize_t no_linklocal_learn_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_no_linklocal_learn);
+}
+static DEVICE_ATTR_RW(no_linklocal_learn);
+
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t multicast_router_show(struct device *d,
struct device_attribute *attr, char *buf)
@@ -403,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier);
static ssize_t hash_elasticity_show(struct device *d,
struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->hash_elasticity);
+ return sprintf(buf, "%u\n", RHT_ELASTICITY);
}
static int set_elasticity(struct net_bridge *br, unsigned long val)
{
- br->hash_elasticity = val;
+ br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+ RHT_ELASTICITY);
return 0;
}
@@ -428,10 +449,16 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr,
return sprintf(buf, "%u\n", br->hash_max);
}
+static int set_hash_max(struct net_bridge *br, unsigned long val)
+{
+ br->hash_max = val;
+ return 0;
+}
+
static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_hash_max);
+ return store_bridge_parm(d, buf, len, set_hash_max);
}
static DEVICE_ATTR_RW(hash_max);
@@ -841,6 +868,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_gc_timer.attr,
&dev_attr_group_addr.attr,
&dev_attr_flush.attr,
+ &dev_attr_no_linklocal_learn.attr,
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
&dev_attr_multicast_router.attr,
&dev_attr_multicast_snooping.attr,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 7c87a2fe5248..88715edb119a 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -320,9 +320,6 @@ static ssize_t brport_store(struct kobject *kobj,
if (!rtnl_trylock())
return restart_syscall();
- if (!p->dev || !p->br)
- goto out_unlock;
-
if (brport_attr->store_raw) {
char *buf_copy;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index e84be08b8285..4a2f31157ef5 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -80,14 +80,14 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
}
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
- u16 vid, u16 flags)
+ u16 vid, u16 flags, struct netlink_ext_ack *extack)
{
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q add.
*/
- err = br_switchdev_port_vlan_add(dev, vid, flags);
+ err = br_switchdev_port_vlan_add(dev, vid, flags, extack);
if (err == -EOPNOTSUPP)
return vlan_vid_add(dev, br->vlan_proto, vid);
return err;
@@ -139,7 +139,9 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
/* Returns a master vlan, if it didn't exist it gets created. In all cases a
* a reference is taken to the master vlan before returning.
*/
-static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid)
+static struct net_bridge_vlan *
+br_vlan_get_master(struct net_bridge *br, u16 vid,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *masterv;
@@ -150,7 +152,7 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid
bool changed;
/* missing global ctx, create it now */
- if (br_vlan_add(br, vid, 0, &changed))
+ if (br_vlan_add(br, vid, 0, &changed, extack))
return NULL;
masterv = br_vlan_find(vg, vid);
if (WARN_ON(!masterv))
@@ -214,7 +216,8 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu)
* 4. same as 3 but with both master and brentry flags set so the entry
* will be used for filtering in both the port and the bridge
*/
-static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
+static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan *masterv = NULL;
struct net_bridge_port *p = NULL;
@@ -239,7 +242,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
* This ensures tagged traffic enters the bridge when
* promiscuous mode is disabled by br_manage_promisc().
*/
- err = __vlan_vid_add(dev, br, v->vid, flags);
+ err = __vlan_vid_add(dev, br, v->vid, flags, extack);
if (err)
goto out;
@@ -249,12 +252,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
err = br_vlan_add(br, v->vid,
flags | BRIDGE_VLAN_INFO_BRENTRY,
- &changed);
+ &changed, extack);
if (err)
goto out_filt;
}
- masterv = br_vlan_get_master(br, v->vid);
+ masterv = br_vlan_get_master(br, v->vid, extack);
if (!masterv)
goto out_filt;
v->brvlan = masterv;
@@ -269,7 +272,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
v->stats = masterv->stats;
}
} else {
- err = br_switchdev_port_vlan_add(dev, v->vid, flags);
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack);
if (err && err != -EOPNOTSUPP)
goto out;
}
@@ -421,7 +424,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
}
if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
if (p && (p->flags & BR_VLAN_TUNNEL) &&
br_handle_egress_vlan_tunnel(skb, v)) {
@@ -494,8 +497,8 @@ static bool __allowed_ingress(const struct net_bridge *br,
__vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
else
/* Priority-tagged Frame.
- * At this point, We know that skb->vlan_tci had
- * VLAN_TAG_PRESENT bit and its VID field was 0x000.
+ * At this point, we know that skb->vlan_tci VID
+ * field was 0.
* We update only VID field and preserve PCP field.
*/
skb->vlan_tci |= pvid;
@@ -591,11 +594,12 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
static int br_vlan_add_existing(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan,
- u16 flags, bool *changed)
+ u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
{
int err;
- err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack);
if (err && err != -EOPNOTSUPP)
return err;
@@ -634,7 +638,8 @@ err_flags:
* Must be called with vid in range from 1 to 4094 inclusive.
* changed must be true only if the vlan was created or updated
*/
-int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *vlan;
@@ -646,7 +651,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
vg = br_vlan_group(br);
vlan = br_vlan_find(vg, vid);
if (vlan)
- return br_vlan_add_existing(br, vg, vlan, flags, changed);
+ return br_vlan_add_existing(br, vg, vlan, flags, changed,
+ extack);
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
if (!vlan)
@@ -663,7 +669,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
vlan->br = br;
if (flags & BRIDGE_VLAN_INFO_BRENTRY)
refcount_set(&vlan->refcnt, 1);
- ret = __vlan_add(vlan, flags);
+ ret = __vlan_add(vlan, flags, extack);
if (ret) {
free_percpu(vlan->stats);
kfree(vlan);
@@ -914,7 +920,8 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br)
br->default_pvid = 0;
}
-int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack)
{
const struct net_bridge_vlan *pvent;
struct net_bridge_vlan_group *vg;
@@ -946,7 +953,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY,
- &vlchange);
+ &vlchange, extack);
if (err)
goto out;
br_vlan_delete(br, old_pvid);
@@ -966,7 +973,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
err = nbp_vlan_add(p, pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &vlchange);
+ &vlchange, extack);
if (err)
goto err_port;
nbp_vlan_delete(p, old_pvid);
@@ -988,7 +995,7 @@ err_port:
nbp_vlan_add(p, old_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &vlchange);
+ &vlchange, NULL);
nbp_vlan_delete(p, pvid);
}
@@ -998,7 +1005,7 @@ err_port:
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY,
- &vlchange);
+ &vlchange, NULL);
br_vlan_delete(br, pvid);
}
goto out;
@@ -1021,7 +1028,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
err = -EPERM;
goto out;
}
- err = __br_vlan_set_default_pvid(br, pvid);
+ err = __br_vlan_set_default_pvid(br, pvid, NULL);
out:
return err;
}
@@ -1047,7 +1054,7 @@ int br_vlan_init(struct net_bridge *br)
rcu_assign_pointer(br->vlgrp, vg);
ret = br_vlan_add(br, 1,
BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED |
- BRIDGE_VLAN_INFO_BRENTRY, &changed);
+ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL);
if (ret)
goto err_vlan_add;
@@ -1064,7 +1071,7 @@ err_rhtbl:
goto out;
}
-int nbp_vlan_init(struct net_bridge_port *p)
+int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack)
{
struct switchdev_attr attr = {
.orig_dev = p->br->dev,
@@ -1097,7 +1104,7 @@ int nbp_vlan_init(struct net_bridge_port *p)
ret = nbp_vlan_add(p, p->br->default_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &changed);
+ &changed, extack);
if (ret)
goto err_vlan_add;
}
@@ -1122,7 +1129,7 @@ err_vlan_enabled:
* changed must be true only if the vlan was created or updated
*/
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
struct net_bridge_vlan *vlan;
int ret;
@@ -1133,7 +1140,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
/* Pass the flags to the hardware bridge */
- ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack);
if (ret && ret != -EOPNOTSUPP)
return ret;
*changed = __vlan_add_flags(vlan, flags);
@@ -1147,7 +1154,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
vlan->vid = vid;
vlan->port = port;
- ret = __vlan_add(vlan, flags);
+ ret = __vlan_add(vlan, flags, extack);
if (ret)
kfree(vlan);
else
@@ -1217,9 +1224,13 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
{
struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
ASSERT_RTNL();
- if (netif_is_bridge_master(dev))
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
vg = br_vlan_group(netdev_priv(dev));
else
return -EINVAL;
diff --git a/net/can/raw.c b/net/can/raw.c
index 3aab7664933f..c70207537488 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -771,7 +771,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
if (err < 0)
goto free_skb;
- sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sk->sk_tsflags);
skb->dev = dev;
skb->sk = sk;
diff --git a/net/compat.c b/net/compat.c
index 47a614b370cd..f7084780a8f8 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -810,34 +810,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen);
}
-static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
- unsigned int vlen, unsigned int flags,
- struct old_timespec32 __user *timeout)
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout)
{
- int datagrams;
- struct timespec64 ktspec;
-
- if (timeout == NULL)
- return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
- flags | MSG_CMSG_COMPAT, NULL);
-
- if (compat_get_timespec64(&ktspec, timeout))
- return -EFAULT;
-
- datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
- flags | MSG_CMSG_COMPAT, &ktspec);
- if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout))
- datagrams = -EFAULT;
-
- return datagrams;
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, timeout, NULL);
}
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags,
struct old_timespec32 __user *, timeout)
{
- return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, NULL, timeout);
}
+#endif
COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
{
@@ -925,8 +914,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMMSG:
- ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
- compat_ptr(a[4]));
+ ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2],
+ a[3] | MSG_CMSG_COMPAT, NULL,
+ compat_ptr(a[4]));
break;
case SYS_ACCEPT4:
ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 57f3a6fcfc1e..b2651bb6d2a3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
}
EXPORT_SYMBOL(skb_kill_datagram);
-/**
- * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
- * @skb: buffer to copy
- * @offset: offset in the buffer to start copying from
- * @to: iovec iterator to copy to
- * @len: amount of data to copy from buffer to iovec
- */
-int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len)
+int __skb_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, bool fault_short,
+ size_t (*cb)(const void *, size_t, void *, struct iov_iter *),
+ void *data)
{
int start = skb_headlen(skb);
int i, copy = start - offset, start_off = offset, n;
struct sk_buff *frag_iter;
- trace_skb_copy_datagram_iovec(skb, len);
-
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
- n = copy_to_iter(skb->data + offset, copy, to);
+ n = cb(skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ struct page *page = skb_frag_page(frag);
+ u8 *vaddr = kmap(page);
+
if (copy > len)
copy = len;
- n = copy_page_to_iter(skb_frag_page(frag),
- frag->page_offset + offset -
- start, copy, to);
+ n = cb(vaddr + frag->page_offset +
+ offset - start, copy, data, to);
+ kunmap(page);
offset += n;
if (n != copy)
goto short_copy;
@@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_iter(frag_iter, offset - start,
- to, copy))
+ if (__skb_datagram_iter(frag_iter, offset - start,
+ to, copy, fault_short, cb, data))
goto fault;
if ((len -= copy) == 0)
return 0;
@@ -490,11 +486,50 @@ fault:
return -EFAULT;
short_copy:
- if (iov_iter_count(to))
+ if (fault_short || iov_iter_count(to))
goto fault;
return 0;
}
+
+/**
+ * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
+ * and update a hash.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @hash: hash request to update
+ */
+int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len,
+ struct ahash_request *hash)
+{
+ return __skb_datagram_iter(skb, offset, to, len, true,
+ hash_and_copy_to_iter, hash);
+}
+EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+
+static size_t simple_copy_to_iter(const void *addr, size_t bytes,
+ void *data __always_unused, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+/**
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
+{
+ trace_skb_copy_datagram_iovec(skb, len);
+ return __skb_datagram_iter(skb, offset, to, len, false,
+ simple_copy_to_iter, NULL);
+}
EXPORT_SYMBOL(skb_copy_datagram_iter);
/**
@@ -645,131 +680,22 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
+/**
+ * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator
+ * and update a checksum.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @csump: checksum pointer
+ */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
- int start = skb_headlen(skb);
- int i, copy = start - offset, start_off = offset;
- struct sk_buff *frag_iter;
- int pos = 0;
- int n;
-
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
- offset += n;
- if (n != copy)
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- pos = copy;
- }
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- WARN_ON(start > offset + len);
-
- end = start + skb_frag_size(frag);
- if ((copy = end - offset) > 0) {
- __wsum csum2 = 0;
- struct page *page = skb_frag_page(frag);
- u8 *vaddr = kmap(page);
-
- if (copy > len)
- copy = len;
- n = csum_and_copy_to_iter(vaddr + frag->page_offset +
- offset - start, copy,
- &csum2, to);
- kunmap(page);
- offset += n;
- if (n != copy)
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if (!(len -= copy))
- return 0;
- pos += copy;
- }
- start = end;
- }
-
- skb_walk_frags(skb, frag_iter) {
- int end;
-
- WARN_ON(start > offset + len);
-
- end = start + frag_iter->len;
- if ((copy = end - offset) > 0) {
- __wsum csum2 = 0;
- if (copy > len)
- copy = len;
- if (skb_copy_and_csum_datagram(frag_iter,
- offset - start,
- to, copy,
- &csum2))
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- pos += copy;
- }
- start = end;
- }
- if (!len)
- return 0;
-
-fault:
- iov_iter_revert(to, offset - start_off);
- return -EFAULT;
-}
-
-__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
-{
- __sum16 sum;
-
- sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
- if (!skb_shared(skb))
- skb->csum_valid = !sum;
- return sum;
-}
-EXPORT_SYMBOL(__skb_checksum_complete_head);
-
-__sum16 __skb_checksum_complete(struct sk_buff *skb)
-{
- __wsum csum;
- __sum16 sum;
-
- csum = skb_checksum(skb, 0, skb->len, 0);
-
- /* skb->csum holds pseudo checksum */
- sum = csum_fold(csum_add(skb->csum, csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
-
- if (!skb_shared(skb)) {
- /* Save full packet checksum */
- skb->csum = csum;
- skb->ip_summed = CHECKSUM_COMPLETE;
- skb->csum_complete_sw = 1;
- skb->csum_valid = !sum;
- }
-
- return sum;
+ return __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, csump);
}
-EXPORT_SYMBOL(__skb_checksum_complete);
/**
* skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
@@ -810,7 +736,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
- netdev_rx_csum_fault(NULL);
+ netdev_rx_csum_fault(NULL, skb);
}
return 0;
fault:
diff --git a/net/core/dev.c b/net/core/dev.c
index ddc551f24ba2..1b5a4410be0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -145,6 +145,7 @@
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
+#include <linux/indirect_call_wrapper.h>
#include "net-sysfs.h"
@@ -162,6 +163,9 @@ static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info);
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
@@ -1361,7 +1365,7 @@ void netdev_notify_peers(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_notify_peers);
-static int __dev_open(struct net_device *dev)
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
@@ -1377,7 +1381,7 @@ static int __dev_open(struct net_device *dev)
*/
netpoll_poll_disable(dev);
- ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
ret = notifier_to_errno(ret);
if (ret)
return ret;
@@ -1406,7 +1410,8 @@ static int __dev_open(struct net_device *dev)
/**
* dev_open - prepare an interface for use.
- * @dev: device to open
+ * @dev: device to open
+ * @extack: netlink extended ack
*
* Takes a device from down to up state. The device's private open
* function is invoked and then the multicast lists are loaded. Finally
@@ -1416,14 +1421,14 @@ static int __dev_open(struct net_device *dev)
* Calling this function on an active interface is a nop. On a failure
* a negative errno code is returned.
*/
-int dev_open(struct net_device *dev)
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
if (dev->flags & IFF_UP)
return 0;
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
if (ret < 0)
return ret;
@@ -1585,6 +1590,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ N(PRE_CHANGEADDR)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -1733,6 +1739,18 @@ static int call_netdevice_notifiers_info(unsigned long val,
return raw_notifier_call_chain(&netdev_chain, val, info);
}
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ .extack = extack,
+ };
+
+ return call_netdevice_notifiers_info(val, &info);
+}
+
/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
@@ -1744,11 +1762,7 @@ static int call_netdevice_notifiers_info(unsigned long val,
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- struct netdev_notifier_info info = {
- .dev = dev,
- };
-
- return call_netdevice_notifiers_info(val, &info);
+ return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
@@ -2175,6 +2189,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
return active;
}
+static void reset_xps_maps(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ bool is_rxqs_map)
+{
+ if (is_rxqs_map) {
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ } else {
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+ }
+ static_key_slow_dec_cpuslocked(&xps_needed);
+ kfree_rcu(dev_maps, rcu);
+}
+
static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
struct xps_dev_maps *dev_maps, unsigned int nr_ids,
u16 offset, u16 count, bool is_rxqs_map)
@@ -2186,18 +2214,15 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
j < nr_ids;)
active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
count);
- if (!active) {
- if (is_rxqs_map) {
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+ if (!active)
+ reset_xps_maps(dev, dev_maps, is_rxqs_map);
- for (i = offset + (count - 1); count--; i--)
- netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
+ if (!is_rxqs_map) {
+ for (i = offset + (count - 1); count--; i--) {
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
}
- kfree_rcu(dev_maps, rcu);
}
}
@@ -2234,10 +2259,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
false);
out_no_maps:
- if (static_key_enabled(&xps_rxqs_needed))
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
-
- static_key_slow_dec_cpuslocked(&xps_needed);
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
@@ -2355,9 +2376,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
- static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
- static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ if (!dev_maps) {
+ /* Increment static keys at most once per type */
+ static_key_slow_inc_cpuslocked(&xps_needed);
+ if (is_rxqs_map)
+ static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ }
for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
j < nr_ids;) {
@@ -2455,13 +2479,8 @@ out_no_new_maps:
}
/* free map if not active */
- if (!active) {
- if (is_rxqs_map)
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- else
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- kfree_rcu(dev_maps, rcu);
- }
+ if (!active)
+ reset_xps_maps(dev, dev_maps, is_rxqs_map);
out_no_maps:
mutex_unlock(&xps_map_mutex);
@@ -3091,10 +3110,17 @@ EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
-void netdev_rx_csum_fault(struct net_device *dev)
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ if (dev)
+ pr_err("dev features: %pNF\n", &dev->features);
+ pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
+ skb->len, skb->data_len, skb->pkt_type,
+ skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
+ skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
dump_stack();
}
}
@@ -4520,9 +4546,14 @@ static int netif_rx_internal(struct sk_buff *skb)
int netif_rx(struct sk_buff *skb)
{
+ int ret;
+
trace_netif_rx_entry(skb);
- return netif_rx_internal(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(netif_rx);
@@ -4537,6 +4568,7 @@ int netif_rx_ni(struct sk_buff *skb)
if (local_softirq_pending())
do_softirq();
preempt_enable();
+ trace_netif_rx_ni_exit(err);
return err;
}
@@ -4889,7 +4921,7 @@ skip_classify:
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
@@ -5009,7 +5041,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
- list_del(&skb->list);
+ skb_list_del_init(skb);
__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
@@ -5165,7 +5197,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(netdev_tstamp_prequeue, skb);
- list_del(&skb->list);
+ skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
}
@@ -5176,7 +5208,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
rcu_read_lock();
list_for_each_entry_safe(skb, next, head, list) {
xdp_prog = rcu_dereference(skb->dev->xdp_prog);
- list_del(&skb->list);
+ skb_list_del_init(skb);
if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
list_add_tail(&skb->list, &sublist);
}
@@ -5195,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
if (cpu >= 0) {
/* Will be handled, remove from list */
- list_del(&skb->list);
+ skb_list_del_init(skb);
enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
}
}
@@ -5222,9 +5254,14 @@ static void netif_receive_skb_list_internal(struct list_head *head)
*/
int netif_receive_skb(struct sk_buff *skb)
{
+ int ret;
+
trace_netif_receive_skb_entry(skb);
- return netif_receive_skb_internal(skb);
+ ret = netif_receive_skb_internal(skb);
+ trace_netif_receive_skb_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -5244,9 +5281,12 @@ void netif_receive_skb_list(struct list_head *head)
if (list_empty(head))
return;
- list_for_each_entry(skb, head, list)
- trace_netif_receive_skb_list_entry(skb);
+ if (trace_netif_receive_skb_list_entry_enabled()) {
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ }
netif_receive_skb_list_internal(head);
+ trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
@@ -5299,6 +5339,8 @@ static void flush_all_backlogs(void)
put_online_cpus();
}
+INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
static int napi_gro_complete(struct sk_buff *skb)
{
struct packet_offload *ptype;
@@ -5318,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb)
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
- err = ptype->callbacks.gro_complete(skb, 0);
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
break;
}
rcu_read_unlock();
@@ -5357,11 +5401,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
- u32 i;
+ unsigned long bitmask = napi->gro_bitmask;
+ unsigned int i, base = ~0U;
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- if (test_bit(i, &napi->gro_bitmask))
- __napi_gro_flush_chain(napi, i, flush_old);
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __napi_gro_flush_chain(napi, base, flush_old);
}
}
EXPORT_SYMBOL(napi_gro_flush);
@@ -5386,7 +5432,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
+ if (skb_vlan_tag_present(p))
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
@@ -5461,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head)
napi_gro_complete(oldest);
}
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
+ struct sk_buff *));
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@ -5510,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->csum_valid = 0;
}
- pp = ptype->callbacks.gro_receive(gro_head, skb);
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ gro_head, skb);
break;
}
rcu_read_unlock();
@@ -5634,12 +5688,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ gro_result_t ret;
+
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb);
- return napi_skb_finish(dev_gro_receive(napi, skb), skb);
+ ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(napi_gro_receive);
@@ -5652,7 +5711,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb->dev = napi->dev;
skb->skb_iif = 0;
@@ -5757,6 +5816,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
+ gro_result_t ret;
struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
@@ -5764,7 +5824,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
trace_napi_gro_frags_entry(skb);
- return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(napi_gro_frags);
@@ -5780,10 +5843,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
+ netdev_rx_csum_fault(skb->dev, skb);
}
NAPI_GRO_CB(skb)->csum = wsum;
@@ -6204,8 +6268,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
- pr_err_once("netif_napi_add() called with weight %d on device %s\n",
- weight, dev->name);
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
+ weight);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
@@ -7462,7 +7526,8 @@ unsigned int dev_get_flags(const struct net_device *dev)
}
EXPORT_SYMBOL(dev_get_flags);
-int __dev_change_flags(struct net_device *dev, unsigned int flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
unsigned int old_flags = dev->flags;
int ret;
@@ -7499,7 +7564,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
if (old_flags & IFF_UP)
__dev_close(dev);
else
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
}
if ((flags ^ dev->gflags) & IFF_PROMISC) {
@@ -7559,16 +7624,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
* dev_change_flags - change device settings
* @dev: device
* @flags: device state flags
+ * @extack: netlink extended ack
*
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
-int dev_change_flags(struct net_device *dev, unsigned int flags)
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
- ret = __dev_change_flags(dev, flags);
+ ret = __dev_change_flags(dev, flags, extack);
if (ret < 0)
return ret;
@@ -7701,13 +7768,36 @@ void dev_set_group(struct net_device *dev, int new_group)
EXPORT_SYMBOL(dev_set_group);
/**
+ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ */
+int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_pre_changeaddr_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .dev_addr = addr,
+ };
+ int rc;
+
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
+ return notifier_to_errno(rc);
+}
+EXPORT_SYMBOL(dev_pre_changeaddr_notify);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
+ * @extack: netlink extended ack
*
* Change the hardware (MAC) address of the device
*/
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
@@ -7718,6 +7808,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
+ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ if (err)
+ return err;
err = ops->ndo_set_mac_address(dev, sa);
if (err)
return err;
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index d884d8f5f0e5..a6723b306717 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
EXPORT_SYMBOL(__hw_addr_sync_dev);
/**
+ * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking
+ * into account references
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address or reference on it should be added
+ * @unsync: function to call if address or some reference on it should removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address or references on it
+ * add/remove notifications. The unsync function may be NULL in which case
+ * the addresses or references on it requiring removal will simply be
+ * removed without any notification to the device. That is responsibility of
+ * the driver to identify and distribute address or references on it between
+ * internal address tables.
+ **/
+int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *,
+ const unsigned char *, int),
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err, ref_cnt;
+
+ /* first go through and flush out any unsynced/stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address is not used */
+ if ((ha->sync_cnt << 1) <= ha->refcount)
+ continue;
+
+ /* if fails defer unsyncing address */
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ if (unsync && unsync(dev, ha->addr, ref_cnt))
+ continue;
+
+ ha->refcount = (ref_cnt << 1) + 1;
+ ha->sync_cnt = ref_cnt;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync updated/new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address added or reused */
+ if ((ha->sync_cnt << 1) >= ha->refcount)
+ continue;
+
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ err = sync(dev, ha->addr, ref_cnt);
+ if (err)
+ return err;
+
+ ha->refcount = ref_cnt << 1;
+ ha->sync_cnt = ref_cnt;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_ref_sync_dev);
+
+/**
+ * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on
+ * it from device
+ * @list: address list to remove synchronized addresses (references on it) from
+ * @dev: device to sync
+ * @unsync: function to call if address and references on it should be removed
+ *
+ * Remove all addresses that were added to the device by
+ * __hw_addr_ref_sync_dev(). This function is intended to be called from the
+ * ndo_stop or ndo_open functions on devices that require explicit address (or
+ * references on it) add/remove notifications. If the unsync function pointer
+ * is NULL then this function can be used to just reset the sync_cnt for the
+ * addresses in the list.
+ **/
+void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr, ha->sync_cnt))
+ continue;
+
+ ha->refcount -= ha->sync_cnt - 1;
+ ha->sync_cnt = 0;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_ref_unsync_dev);
+
+/**
* __hw_addr_unsync_dev - Remove synchronized addresses from device
* @list: address list to remove synchronized addresses from
* @dev: device to sync
@@ -401,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr,
ASSERT_RTNL();
+ err = dev_pre_changeaddr_notify(dev, addr, NULL);
+ if (err)
+ return err;
err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
if (!err)
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 90e8aa36881e..31380fd5a4e2 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
switch (cmd) {
case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
+ return dev_change_flags(dev, ifr->ifr_flags, NULL);
case SIOCSIFMETRIC: /* Set the metric on the interface
(currently unused) */
@@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFHWADDR:
if (dev->addr_len > sizeof(struct sockaddr))
return -EINVAL;
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3a4b29a13d31..abb0da9d7b4b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
+ .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
+ .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
diff --git a/net/core/filter.c b/net/core/filter.c
index e521c5ebc7d1..447dd1bad31f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
break;
case SKF_AD_VLAN_TAG:
- case SKF_AD_VLAN_TAG_PRESENT:
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sk_buff, vlan_tci));
- if (skb_field == SKF_AD_VLAN_TAG) {
- *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
- ~VLAN_TAG_PRESENT);
- } else {
- /* dst_reg >>= 12 */
- *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
- /* dst_reg &= 1 */
+ break;
+ case SKF_AD_VLAN_TAG_PRESENT:
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
+ if (PKT_VLAN_PRESENT_BIT)
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
+ if (PKT_VLAN_PRESENT_BIT < 7)
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
- }
break;
}
@@ -467,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
bool ldx_off_ok = offset <= S16_MAX;
*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
- *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ if (offset)
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
size, 2 + endian + (!ldx_off_ok * 2));
if (ldx_off_ok) {
@@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+ int prev;
+
+ do {
+ prev = i;
+ sk_msg_iter_var_next(i);
+ msg->sg.data[prev] = msg->sg.data[i];
+ } while (i != msg->sg.end);
+
+ sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+ struct scatterlist tmp, sge;
+
+ sk_msg_iter_next(msg, end);
+ sge = sk_msg_elem_cpy(msg, i);
+ sk_msg_iter_var_next(i);
+ tmp = sk_msg_elem_cpy(msg, i);
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sk_msg_iter_var_next(i);
+ sge = tmp;
+ tmp = sk_msg_elem_cpy(msg, i);
+ }
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ u32 i = 0, l, space, offset = 0;
+ u64 last = start + len;
+ int pop;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ offset += l;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ /* Bounds checks: start and pop must be inside message */
+ if (start >= offset + l || last >= msg->sg.size)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ pop = len;
+ /* --------------| offset
+ * -| start |-------- len -------|
+ *
+ * |----- a ----|-------- pop -------|----- b ----|
+ * |______________________________________________| length
+ *
+ *
+ * a: region at front of scatter element to save
+ * b: region at back of scatter element to save when length > A + pop
+ * pop: region to pop from element, same as input 'pop' here will be
+ * decremented below per iteration.
+ *
+ * Two top-level cases to handle when start != offset, first B is non
+ * zero and second B is zero corresponding to when a pop includes more
+ * than one element.
+ *
+ * Then if B is non-zero AND there is no space allocate space and
+ * compact A, B regions into page. If there is space shift ring to
+ * the rigth free'ing the next element in ring to place B, leaving
+ * A untouched except to reduce length.
+ */
+ if (start != offset) {
+ struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ int a = start;
+ int b = sge->length - pop - a;
+
+ sk_msg_iter_var_next(i);
+
+ if (pop < sge->length - a) {
+ if (space) {
+ sge->length = a;
+ sk_msg_shift_right(msg, i);
+ nsge = sk_msg_elem(msg, i);
+ get_page(sg_page(sge));
+ sg_set_page(nsge,
+ sg_page(sge),
+ b, sge->offset + pop + a);
+ } else {
+ struct page *page, *orig;
+ u8 *to, *from;
+
+ page = alloc_pages(__GFP_NOWARN |
+ __GFP_COMP | GFP_ATOMIC,
+ get_order(a + b));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ sge->length = a;
+ orig = sg_page(sge);
+ from = sg_virt(sge);
+ to = page_address(page);
+ memcpy(to, from, a);
+ memcpy(to + a, from + a + pop, b);
+ sg_set_page(sge, page, a + b, 0);
+ put_page(orig);
+ }
+ pop = 0;
+ } else if (pop >= sge->length - a) {
+ sge->length = a;
+ pop -= (sge->length - a);
+ }
+ }
+
+ /* From above the current layout _must_ be as follows,
+ *
+ * -| offset
+ * -| start
+ *
+ * |---- pop ---|---------------- b ------------|
+ * |____________________________________________| length
+ *
+ * Offset and start of the current msg elem are equal because in the
+ * previous case we handled offset != start and either consumed the
+ * entire element and advanced to the next element OR pop == 0.
+ *
+ * Two cases to handle here are first pop is less than the length
+ * leaving some remainder b above. Simply adjust the element's layout
+ * in this case. Or pop >= length of the element so that b = 0. In this
+ * case advance to next element decrementing pop.
+ */
+ while (pop) {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (pop < sge->length) {
+ sge->length -= pop;
+ sge->offset += pop;
+ pop = 0;
+ } else {
+ pop -= sge->length;
+ sk_msg_shift_left(msg, i);
+ }
+ sk_msg_iter_var_next(i);
+ }
+
+ sk_mem_uncharge(msg->sk, len - pop);
+ msg->sg.size -= (len - pop);
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+ .func = bpf_msg_pop_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
+ .func = bpf_sockopt_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
@@ -4825,47 +5010,40 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
- struct sk_buff *skb, u8 family, u8 proto)
+ int dif, int sdif, u8 family, u8 proto)
{
bool refcounted = false;
struct sock *sk = NULL;
- int dif = 0;
-
- if (skb->dev)
- dif = skb->dev->ifindex;
if (family == AF_INET) {
__be32 src4 = tuple->ipv4.saddr;
__be32 dst4 = tuple->ipv4.daddr;
- int sdif = inet_sdif(skb);
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+ sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
else
sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
- dif, sdif, &udp_table, skb);
+ dif, sdif, &udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
- u16 hnum = ntohs(tuple->ipv6.dport);
- int sdif = inet6_sdif(skb);
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+ sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
src6, tuple->ipv6.sport,
- dst6, hnum,
+ dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
else if (likely(ipv6_bpf_stub))
sk = ipv6_bpf_stub->udp6_lib_lookup(net,
src6, tuple->ipv6.sport,
- dst6, hnum,
+ dst6, tuple->ipv6.dport,
dif, sdif,
- &udp_table, skb);
+ &udp_table, NULL);
#endif
}
@@ -4882,31 +5060,34 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
* callers to satisfy BPF_CALL declarations.
*/
static unsigned long
-bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
- u8 proto, u64 netns_id, u64 flags)
+__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+ u64 flags)
{
- struct net *caller_net;
struct sock *sk = NULL;
u8 family = AF_UNSPEC;
struct net *net;
+ int sdif;
family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
- if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+ if (unlikely(family == AF_UNSPEC || flags ||
+ !((s32)netns_id < 0 || netns_id <= S32_MAX)))
goto out;
- if (skb->dev)
- caller_net = dev_net(skb->dev);
+ if (family == AF_INET)
+ sdif = inet_sdif(skb);
else
- caller_net = sock_net(skb->sk);
- if (netns_id) {
+ sdif = inet6_sdif(skb);
+
+ if ((s32)netns_id < 0) {
+ net = caller_net;
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
+ } else {
net = get_net_ns_by_id(caller_net, netns_id);
if (unlikely(!net))
goto out;
- sk = sk_lookup(net, tuple, skb, family, proto);
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
put_net(net);
- } else {
- net = caller_net;
- sk = sk_lookup(net, tuple, skb, family, proto);
}
if (sk)
@@ -4915,6 +5096,25 @@ out:
return (unsigned long) sk;
}
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct net *caller_net;
+ int ifindex;
+
+ if (skb->dev) {
+ caller_net = dev_net(skb->dev);
+ ifindex = skb->dev->ifindex;
+ } else {
+ caller_net = sock_net(skb->sk);
+ ifindex = 0;
+ }
+
+ return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
+ proto, netns_id, flags);
+}
+
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
@@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net *caller_net = dev_net(ctx->rxq->dev);
+ int ifindex = ctx->rxq->dev->ifindex;
+
+ return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
+ IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+ .func = bpf_xdp_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net *caller_net = dev_net(ctx->rxq->dev);
+ int ifindex = ctx->rxq->dev->ifindex;
+
+ return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
+ IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
+ .func = bpf_xdp_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
+ IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
+ .func = bpf_sock_addr_sk_lookup_tcp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
+ IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
+ .func = bpf_sock_addr_sk_lookup_udp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_xdp_adjust_meta ||
func == bpf_msg_pull_data ||
func == bpf_msg_push_data ||
+ func == bpf_msg_pop_data ||
func == bpf_xdp_adjust_tail ||
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
func == bpf_lwt_seg6_store_bytes ||
@@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_sock_addr_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sock_addr_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sock_addr_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+#endif /* CONFIG_INET */
default:
return bpf_base_func_proto(func_id);
}
@@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_adjust_tail_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_xdp_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_xdp_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -5240,6 +5538,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_sock_ops_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_sockopt_event_output_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_pull_data_proto;
case BPF_FUNC_msg_push_data:
return &bpf_msg_push_data_proto;
+ case BPF_FUNC_msg_pop_data:
+ return &bpf_msg_pop_data_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5436,8 +5738,12 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != size_default)
return false;
break;
- case bpf_ctx_range(struct __sk_buff, flow_keys):
- if (size != sizeof(struct bpf_flow_keys *))
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (size != sizeof(__u64))
return false;
break;
default:
@@ -5465,8 +5771,10 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
return false;
}
@@ -5490,7 +5798,8 @@ static bool cg_skb_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
@@ -5505,6 +5814,10 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, priority):
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (!capable(CAP_SYS_ADMIN))
+ return false;
+ break;
default:
return false;
}
@@ -5531,7 +5844,9 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
return false;
}
@@ -5741,6 +6056,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, priority):
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
break;
default:
return false;
@@ -5757,7 +6073,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -5959,7 +6275,9 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
return false;
}
@@ -5995,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size,
if (type == BPF_WRITE)
return false;
+ if (off % size != 0)
+ return false;
+
switch (off) {
case offsetof(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
@@ -6006,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
- default:
+ case bpf_ctx_range(struct sk_msg_md, family):
+ case bpf_ctx_range(struct sk_msg_md, remote_ip4):
+ case bpf_ctx_range(struct sk_msg_md, local_ip4):
+ case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct sk_msg_md, remote_port):
+ case bpf_ctx_range(struct sk_msg_md, local_port):
+ case bpf_ctx_range(struct sk_msg_md, size):
if (size != sizeof(__u32))
return false;
- }
-
- if (off < 0 || off >= sizeof(struct sk_msg_md))
- return false;
- if (off % size != 0)
+ break;
+ default:
return false;
-
+ }
return true;
}
@@ -6040,12 +6365,14 @@ static bool flow_dissector_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
- case bpf_ctx_range(struct __sk_buff, flow_keys):
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
info->reg_type = PTR_TO_FLOW_KEYS;
break;
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
return false;
}
@@ -6140,19 +6467,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, vlan_present):
- case offsetof(struct __sk_buff, vlan_tci):
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+ *target_size = 1;
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+ PKT_VLAN_PRESENT_OFFSET());
+ if (PKT_VLAN_PRESENT_BIT)
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
+ if (PKT_VLAN_PRESENT_BIT < 7)
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+ break;
+ case offsetof(struct __sk_buff, vlan_tci):
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff, vlan_tci, 2,
target_size));
- if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
- ~VLAN_TAG_PRESENT);
- } else {
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
- }
break;
case offsetof(struct __sk_buff, cb[0]) ...
@@ -6355,6 +6682,33 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
si->src_reg, off);
break;
+
+ case offsetof(struct __sk_buff, tstamp):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_DW,
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ tstamp, 8,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_DW,
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ tstamp, 8,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, wire_len):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, wire_len);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, pkt_len);
+ *target_size = 4;
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
}
return insn - insn_buf;
@@ -7071,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
int off;
#endif
+ /* convert ctx uses the fact sg element is first in struct */
+ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
+
switch (si->off) {
case offsetof(struct sk_msg_md, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
@@ -7183,6 +7540,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num));
break;
+
+ case offsetof(struct sk_msg_md, size):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_sg, size));
+ break;
}
return insn - insn_buf;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 588f475019d4..9f2840510e63 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -783,6 +783,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
/* Pass parameters to the BPF program */
cb->qdisc_cb.flow_keys = &flow_keys;
flow_keys.nhoff = nhoff;
+ flow_keys.thoff = nhoff;
bpf_compute_data_pointers((struct sk_buff *)skb);
result = BPF_PROG_RUN(attached, skb);
@@ -790,9 +791,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
/* Restore state */
memcpy(cb, &cb_saved, sizeof(cb_saved));
+ flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len);
+ flow_keys.thoff = clamp_t(u16, flow_keys.thoff,
+ flow_keys.nhoff, skb->len);
+
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
- key_control->thoff = min_t(u16, key_control->thoff, skb->len);
rcu_read_unlock();
return result == BPF_OK;
}
@@ -952,8 +956,7 @@ proto_again:
if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
- key_vlan->vlan_priority =
- (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
} else {
key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
VLAN_VID_MASK;
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index 4b54e5f107c6..acf45ddbe924 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -84,6 +84,7 @@ void gro_cells_destroy(struct gro_cells *gcells)
for_each_possible_cpu(i) {
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+ napi_disable(&cell->napi);
netif_napi_del(&cell->napi);
__skb_queue_purge(&cell->napi_skbs);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 41954e42a2de..763a7b08df67 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -118,21 +118,77 @@ unsigned long neigh_rand_reach_time(unsigned long base)
}
EXPORT_SYMBOL(neigh_rand_reach_time);
+static void neigh_mark_dead(struct neighbour *n)
+{
+ n->dead = 1;
+ if (!list_empty(&n->gc_list)) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ }
+}
+
+static void neigh_update_gc_list(struct neighbour *n)
+{
+ bool on_gc_list, exempt_from_gc;
+
+ write_lock_bh(&n->tbl->lock);
+ write_lock(&n->lock);
+
+ /* remove from the gc list if new state is permanent or if neighbor
+ * is externally learned; otherwise entry should be on the gc list
+ */
+ exempt_from_gc = n->nud_state & NUD_PERMANENT ||
+ n->flags & NTF_EXT_LEARNED;
+ on_gc_list = !list_empty(&n->gc_list);
+
+ if (exempt_from_gc && on_gc_list) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ } else if (!exempt_from_gc && !on_gc_list) {
+ /* add entries to the tail; cleaning removes from the front */
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+ atomic_inc(&n->tbl->gc_entries);
+ }
+
+ write_unlock(&n->lock);
+ write_unlock_bh(&n->tbl->lock);
+}
-static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
- struct neighbour __rcu **np, struct neigh_table *tbl)
+static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags,
+ int *notify)
+{
+ bool rc = false;
+ u8 ndm_flags;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN))
+ return rc;
+
+ ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
+ if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) {
+ if (ndm_flags & NTF_EXT_LEARNED)
+ neigh->flags |= NTF_EXT_LEARNED;
+ else
+ neigh->flags &= ~NTF_EXT_LEARNED;
+ rc = true;
+ *notify = 1;
+ }
+
+ return rc;
+}
+
+static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
+ struct neigh_table *tbl)
{
bool retval = false;
write_lock(&n->lock);
- if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) &&
- !(n->flags & flags)) {
+ if (refcount_read(&n->refcnt) == 1) {
struct neighbour *neigh;
neigh = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(*np, neigh);
- n->dead = 1;
+ neigh_mark_dead(n);
retval = true;
}
write_unlock(&n->lock);
@@ -158,7 +214,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock)))) {
if (n == ndel)
- return neigh_del(n, 0, 0, np, tbl);
+ return neigh_del(n, np, tbl);
np = &n->next;
}
return false;
@@ -166,32 +222,29 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
static int neigh_forced_gc(struct neigh_table *tbl)
{
+ int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
+ unsigned long tref = jiffies - 5 * HZ;
+ struct neighbour *n, *tmp;
int shrunk = 0;
- int i;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour *n;
- struct neighbour __rcu **np;
- np = &nht->hash_buckets[i];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
- /* Neighbour record may be discarded if:
- * - nobody refers to it.
- * - it is not permanent
- */
- if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np,
- tbl)) {
- shrunk = 1;
- continue;
- }
- np = &n->next;
+ list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
+ if (refcount_read(&n->refcnt) == 1) {
+ bool remove = false;
+
+ write_lock(&n->lock);
+ if ((n->nud_state == NUD_FAILED) ||
+ time_after(tref, n->updated))
+ remove = true;
+ write_unlock(&n->lock);
+
+ if (remove && neigh_remove_one(n, tbl))
+ shrunk++;
+ if (shrunk >= max_clean)
+ break;
}
}
@@ -260,8 +313,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
- n->dead = 1;
-
+ neigh_mark_dead(n);
if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
@@ -321,13 +373,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl,
+ struct net_device *dev,
+ bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
- entries = atomic_inc_return(&tbl->entries) - 1;
+ if (exempt_from_gc)
+ goto do_alloc;
+
+ entries = atomic_inc_return(&tbl->gc_entries) - 1;
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
@@ -340,6 +397,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
}
}
+do_alloc:
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
@@ -358,11 +416,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
n->tbl = tbl;
refcount_set(&n->refcnt, 1);
n->dead = 1;
+ INIT_LIST_HEAD(&n->gc_list);
+
+ atomic_inc(&tbl->entries);
out:
return n;
out_entries:
- atomic_dec(&tbl->entries);
+ if (!exempt_from_gc)
+ atomic_dec(&tbl->gc_entries);
goto out;
}
@@ -505,13 +567,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
}
EXPORT_SYMBOL(neigh_lookup_nodev);
-struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev, bool want_ref)
+static struct neighbour *___neigh_create(struct neigh_table *tbl,
+ const void *pkey,
+ struct net_device *dev,
+ bool exempt_from_gc, bool want_ref)
{
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);
u32 hash_val;
unsigned int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
@@ -574,6 +638,9 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
}
n->dead = 0;
+ if (!exempt_from_gc)
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+
if (want_ref)
neigh_hold(n);
rcu_assign_pointer(n->next,
@@ -591,6 +658,12 @@ out_neigh_release:
neigh_release(n);
goto out;
}
+
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
+{
+ return ___neigh_create(tbl, pkey, dev, false, want_ref);
+}
EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, unsigned int key_len)
@@ -652,6 +725,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
if (!n)
goto out;
+ n->protocol = 0;
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
@@ -854,7 +928,7 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
- n->dead = 1;
+ neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
@@ -1137,9 +1211,11 @@ static void neigh_update_hhs(struct neighbour *neigh)
Caller MUST hold reference count on the entry.
*/
-int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- u32 flags, u32 nlmsg_pid)
+static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
+ u8 new, u32 flags, u32 nlmsg_pid,
+ struct netlink_ext_ack *extack)
{
+ bool ext_learn_change = false;
u8 old;
int err;
int notify = 0;
@@ -1155,10 +1231,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
- if (neigh->dead)
+ if (neigh->dead) {
+ NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
goto out;
+ }
- neigh_update_ext_learned(neigh, flags, &notify);
+ ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify);
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
@@ -1193,8 +1271,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
use it, otherwise discard the request.
*/
err = -EINVAL;
- if (!(old & NUD_VALID))
+ if (!(old & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack, "No link layer address given");
goto out;
+ }
lladdr = neigh->ha;
}
@@ -1302,11 +1382,20 @@ out:
neigh_update_is_router(neigh, flags, &notify);
write_unlock_bh(&neigh->lock);
+ if (((new ^ old) & NUD_PERMANENT) || ext_learn_change)
+ neigh_update_gc_list(neigh);
+
if (notify)
neigh_update_notify(neigh, nlmsg_pid);
return err;
}
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags, u32 nlmsg_pid)
+{
+ return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL);
+}
EXPORT_SYMBOL(neigh_update);
/* Update the neigh to listen temporarily for probe responses, even if it is
@@ -1571,6 +1660,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
unsigned long phsize;
INIT_LIST_HEAD(&tbl->parms_list);
+ INIT_LIST_HEAD(&tbl->gc_list);
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1);
@@ -1662,6 +1752,19 @@ static struct neigh_table *neigh_find_table(int family)
return tbl;
}
+const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
+ [NDA_PROBES] = { .type = NLA_U32 },
+ [NDA_VLAN] = { .type = NLA_U16 },
+ [NDA_PORT] = { .type = NLA_U16 },
+ [NDA_VNI] = { .type = NLA_U32 },
+ [NDA_IFINDEX] = { .type = NLA_U32 },
+ [NDA_MASTER] = { .type = NLA_U32 },
+ [NDA_PROTOCOL] = { .type = NLA_U8 },
+};
+
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -1678,8 +1781,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
- if (dst_attr == NULL)
+ if (!dst_attr) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
@@ -1694,8 +1799,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tbl == NULL)
return -EAFNOSUPPORT;
- if (nla_len(dst_attr) < (int)tbl->key_len)
+ if (nla_len(dst_attr) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
goto out;
+ }
if (ndm->ndm_flags & NTF_PROXY) {
err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
@@ -1711,10 +1818,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN,
- NETLINK_CB(skb).portid);
+ err = __neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN,
+ NETLINK_CB(skb).portid, extack);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh, tbl);
@@ -1736,16 +1842,19 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev = NULL;
struct neighbour *neigh;
void *dst, *lladdr;
+ u8 protocol = 0;
int err;
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack);
if (err < 0)
goto out;
err = -EINVAL;
- if (tb[NDA_DST] == NULL)
+ if (!tb[NDA_DST]) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
@@ -1755,19 +1864,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) {
+ NL_SET_ERR_MSG(extack, "Invalid link address");
goto out;
+ }
}
tbl = neigh_find_table(ndm->ndm_family);
if (tbl == NULL)
return -EAFNOSUPPORT;
- if (nla_len(tb[NDA_DST]) < (int)tbl->key_len)
+ if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
goto out;
+ }
+
dst = nla_data(tb[NDA_DST]);
lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+ if (tb[NDA_PROTOCOL])
+ protocol = nla_get_u8(tb[NDA_PROTOCOL]);
+
if (ndm->ndm_flags & NTF_PROXY) {
struct pneigh_entry *pn;
@@ -1775,22 +1892,30 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
pn = pneigh_lookup(tbl, net, dst, dev, 1);
if (pn) {
pn->flags = ndm->ndm_flags;
+ if (protocol)
+ pn->protocol = protocol;
err = 0;
}
goto out;
}
- if (dev == NULL)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device not specified");
goto out;
+ }
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
+ bool exempt_from_gc;
+
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
- neigh = __neigh_lookup_errno(tbl, dst, dev);
+ exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
+ ndm->ndm_flags & NTF_EXT_LEARNED;
+ neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
@@ -1817,8 +1942,12 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
neigh_event_send(neigh, NULL);
err = 0;
} else
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags,
- NETLINK_CB(skb).portid);
+ err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid, extack);
+
+ if (protocol)
+ neigh->protocol = protocol;
+
neigh_release(neigh);
out:
@@ -2312,6 +2441,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
+ if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2343,6 +2475,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
goto nla_put_failure;
+ if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2494,16 +2629,21 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
- ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) {
+ ndm->ndm_state || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
return -EINVAL;
}
+ if (ndm->ndm_flags & ~NTF_PROXY) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request");
+ return -EINVAL;
+ }
+
err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
- NULL, extack);
+ nda_policy, extack);
} else {
err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
- NULL, extack);
+ nda_policy, extack);
}
if (err < 0)
return err;
@@ -2515,17 +2655,9 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
/* all new attributes should require strict_check */
switch (i) {
case NDA_IFINDEX:
- if (nla_len(tb[i]) != sizeof(u32)) {
- NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request");
- return -EINVAL;
- }
filter->dev_idx = nla_get_u32(tb[i]);
break;
case NDA_MASTER:
- if (nla_len(tb[i]) != sizeof(u32)) {
- NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request");
- return -EINVAL;
- }
filter->master_idx = nla_get_u32(tb[i]);
break;
default:
@@ -2585,6 +2717,186 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+static int neigh_valid_get_req(const struct nlmsghdr *nlh,
+ struct neigh_table **tbl,
+ void **dst, int *dev_idx, u8 *ndm_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ struct ndmsg *ndm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
+ return -EINVAL;
+ }
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_flags & ~NTF_PROXY) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
+ nda_policy, extack);
+ if (err < 0)
+ return err;
+
+ *ndm_flags = ndm->ndm_flags;
+ *dev_idx = ndm->ndm_ifindex;
+ *tbl = neigh_find_table(ndm->ndm_family);
+ if (*tbl == NULL) {
+ NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request");
+ return -EAFNOSUPPORT;
+ }
+
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_DST:
+ if (nla_len(tb[i]) != (int)(*tbl)->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request");
+ return -EINVAL;
+ }
+ *dst = nla_data(tb[i]);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4) /* NDA_PROBES */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static int neigh_get_reply(struct net *net, struct neighbour *neigh,
+ u32 pid, u32 seq)
+{
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0);
+ if (err) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ err = rtnl_unicast(skb, net, pid);
+errout:
+ return err;
+}
+
+static inline size_t pneigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh,
+ u32 pid, u32 seq, struct neigh_table *tbl)
+{
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl);
+ if (err) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ err = rtnl_unicast(skb, net, pid);
+errout:
+ return err;
+}
+
+static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct net_device *dev = NULL;
+ struct neigh_table *tbl = NULL;
+ struct neighbour *neigh;
+ void *dst = NULL;
+ u8 ndm_flags = 0;
+ int dev_idx = 0;
+ int err;
+
+ err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags,
+ extack);
+ if (err < 0)
+ return err;
+
+ if (dev_idx) {
+ dev = __dev_get_by_index(net, dev_idx);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ return -ENODEV;
+ }
+ }
+
+ if (!dst) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
+ return -EINVAL;
+ }
+
+ if (ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ pn = pneigh_lookup(tbl, net, dst, dev, 0);
+ if (!pn) {
+ NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found");
+ return -ENOENT;
+ }
+ return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, tbl);
+ }
+
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return -EINVAL;
+ }
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (!neigh) {
+ NL_SET_ERR_MSG(extack, "Neighbour entry not found");
+ return -ENOENT;
+ }
+
+ err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq);
+
+ neigh_release(neigh);
+
+ return err;
+}
+
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
@@ -2631,7 +2943,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
- n->dead = 1;
+ neigh_mark_dead(n);
} else
np = &n->next;
write_unlock(&n->lock);
@@ -2992,15 +3304,6 @@ static const struct seq_operations neigh_stat_seq_ops = {
};
#endif /* CONFIG_PROC_FS */
-static inline size_t neigh_nlmsg_size(void)
-{
- return NLMSG_ALIGN(sizeof(struct ndmsg))
- + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
- + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
- + nla_total_size(sizeof(struct nda_cacheinfo))
- + nla_total_size(4); /* NDA_PROBES */
-}
-
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
@@ -3384,7 +3687,7 @@ static int __init neigh_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0);
rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
0);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bd67c4d0fcfd..ff9fd2bb4ce4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec);
static int change_flags(struct net_device *dev, unsigned long new_flags)
{
- return dev_change_flags(dev, (unsigned int)new_flags);
+ return dev_change_flags(dev, (unsigned int)new_flags, NULL);
}
static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index fefe72774aeb..b02fb19df2cc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
[NETNSA_NSID] = { .type = NLA_S32 },
[NETNSA_PID] = { .type = NLA_U32 },
[NETNSA_FD] = { .type = NLA_U32 },
+ [NETNSA_TARGET_NSID] = { .type = NLA_S32 },
};
static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -735,23 +736,38 @@ static int rtnl_net_get_size(void)
{
return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
;
}
-static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
- int cmd, struct net *net, int nsid)
+struct net_fill_args {
+ u32 portid;
+ u32 seq;
+ int flags;
+ int cmd;
+ int nsid;
+ bool add_ref;
+ int ref_nsid;
+};
+
+static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
- nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
+ args->flags);
if (!nlh)
return -EMSGSIZE;
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;
- if (nla_put_s32(skb, NETNSA_NSID, nsid))
+ if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
+ goto nla_put_failure;
+
+ if (args->add_ref &&
+ nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ struct net_fill_args fillargs = {
+ .portid = NETLINK_CB(skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .cmd = RTM_NEWNSID,
+ };
+ struct net *peer, *target = net;
struct nlattr *nla;
struct sk_buff *msg;
- struct net *peer;
- int err, id;
+ int err;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy, extack);
@@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
} else if (tb[NETNSA_FD]) {
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
nla = tb[NETNSA_FD];
+ } else if (tb[NETNSA_NSID]) {
+ peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID]));
+ if (!peer)
+ peer = ERR_PTR(-ENOENT);
+ nla = tb[NETNSA_NSID];
} else {
NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
return -EINVAL;
@@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(peer);
}
+ if (tb[NETNSA_TARGET_NSID]) {
+ int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
+
+ target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
+ if (IS_ERR(target)) {
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
+ NL_SET_ERR_MSG(extack,
+ "Target netns reference is invalid");
+ err = PTR_ERR(target);
+ goto out;
+ }
+ fillargs.add_ref = true;
+ fillargs.ref_nsid = peernet2id(net, peer);
+ }
+
msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
if (!msg) {
err = -ENOMEM;
goto out;
}
- id = peernet2id(net, peer);
- err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- RTM_NEWNSID, net, id);
+ fillargs.nsid = peernet2id(target, peer);
+ err = rtnl_net_fill(msg, &fillargs);
if (err < 0)
goto err_out;
@@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
err_out:
nlmsg_free(msg);
out:
+ if (fillargs.add_ref)
+ put_net(target);
put_net(peer);
return err;
}
struct rtnl_net_dump_cb {
- struct net *net;
+ struct net *tgt_net;
+ struct net *ref_net;
struct sk_buff *skb;
- struct netlink_callback *cb;
+ struct net_fill_args fillargs;
int idx;
int s_idx;
};
@@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)
if (net_cb->idx < net_cb->s_idx)
goto cont;
- ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
- net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWNSID, net_cb->net, id);
+ net_cb->fillargs.nsid = id;
+ if (net_cb->fillargs.add_ref)
+ net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
+ ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
if (ret < 0)
return ret;
@@ -842,33 +886,96 @@ cont:
return 0;
}
+static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
+ struct rtnl_net_dump_cb *net_cb,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[NETNSA_MAX + 1];
+ int err, i;
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+ rtnl_net_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NETNSA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ if (i == NETNSA_TARGET_NSID) {
+ struct net *net;
+
+ net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
+ if (IS_ERR(net)) {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ net_cb->fillargs.add_ref = true;
+ net_cb->ref_net = net_cb->tgt_net;
+ net_cb->tgt_net = net;
+ } else {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net *net = sock_net(skb->sk);
struct rtnl_net_dump_cb net_cb = {
- .net = net,
+ .tgt_net = sock_net(skb->sk),
.skb = skb,
- .cb = cb,
+ .fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .cmd = RTM_NEWNSID,
+ },
.idx = 0,
.s_idx = cb->args[0],
};
+ int err = 0;
- if (cb->strict_check &&
- nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) {
- NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request");
- return -EINVAL;
+ if (cb->strict_check) {
+ err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
+ if (err < 0)
+ goto end;
}
- spin_lock_bh(&net->nsid_lock);
- idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- spin_unlock_bh(&net->nsid_lock);
+ spin_lock_bh(&net_cb.tgt_net->nsid_lock);
+ if (net_cb.fillargs.add_ref &&
+ !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
+ !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
+ spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
+ err = -EAGAIN;
+ goto end;
+ }
+ idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ if (net_cb.fillargs.add_ref &&
+ !net_eq(net_cb.ref_net, net_cb.tgt_net))
+ spin_unlock_bh(&net_cb.ref_net->nsid_lock);
+ spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
cb->args[0] = net_cb.idx;
- return skb->len;
+end:
+ if (net_cb.fillargs.add_ref)
+ put_net(net_cb.tgt_net);
+ return err < 0 ? err : skb->len;
}
static void rtnl_net_notifyid(struct net *net, int cmd, int id)
{
+ struct net_fill_args fillargs = {
+ .cmd = cmd,
+ .nsid = id,
+ };
struct sk_buff *msg;
int err = -ENOMEM;
@@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
if (!msg)
goto out;
- err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
+ err = rtnl_net_fill(msg, &fillargs);
if (err < 0)
goto err_out;
@@ -917,7 +1024,8 @@ static int __init net_ns_init(void)
init_net_initialized = true;
up_write(&pernet_ops_rwsem);
- register_pernet_subsys(&net_ns_ops);
+ if (register_pernet_subsys(&net_ns_ops))
+ panic("Could not register network namespace subsystems");
rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2b9fdbc43205..361aabffb8c0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np)
np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
- err = dev_open(ndev);
+ err = dev_open(ndev, NULL);
if (err) {
np_err(np, "failed to open %s\n", ndev->name);
@@ -801,7 +801,7 @@ void __netpoll_cleanup(struct netpoll *np)
ops->ndo_netpoll_cleanup(np->dev);
RCU_INIT_POINTER(np->dev->npinfo, NULL);
- call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
+ call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
} else
RCU_INIT_POINTER(np->dev->npinfo, NULL);
}
@@ -812,7 +812,7 @@ void __netpoll_free(struct netpoll *np)
ASSERT_RTNL();
/* Wait for transmitting packets to finish before freeing. */
- synchronize_rcu_bh();
+ synchronize_rcu();
__netpoll_cleanup(np);
kfree(np);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33d9227a8b80..48f61885fd6f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,7 +59,7 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
-#define RTNL_MAX_TYPE 49
+#define RTNL_MAX_TYPE 50
#define RTNL_SLAVE_MAX_TYPE 36
struct rtnl_link {
@@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb,
sa->sa_family = dev->type;
memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
dev->addr_len);
- err = dev_set_mac_address(dev, sa);
+ err = dev_set_mac_address(dev, sa, extack);
kfree(sa);
if (err)
goto errout;
@@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb,
}
if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
if (err < 0)
goto errout;
}
@@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
old_flags = dev->flags;
if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
- err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ NULL);
if (err < 0)
return err;
}
@@ -2885,9 +2887,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
}
EXPORT_SYMBOL(rtnl_configure_link);
-struct net_device *rtnl_create_link(struct net *net,
- const char *ifname, unsigned char name_assign_type,
- const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
struct net_device *dev;
unsigned int num_tx_queues = 1;
@@ -2903,11 +2907,15 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
- if (num_tx_queues < 1 || num_tx_queues > 4096)
+ if (num_tx_queues < 1 || num_tx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
return ERR_PTR(-EINVAL);
+ }
- if (num_rx_queues < 1 || num_rx_queues > 4096)
+ if (num_rx_queues < 1 || num_rx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
return ERR_PTR(-EINVAL);
+ }
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
@@ -2965,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
return 0;
}
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attr, struct netlink_ext_ack *extack)
{
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+ unsigned char name_assign_type = NET_NAME_USER;
+ struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct net_device *master_dev = NULL;
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
- const struct rtnl_link_ops *m_ops = NULL;
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net *dest_net, *link_net;
+ struct nlattr **slave_data;
+ char kind[MODULE_NAME_LEN];
struct net_device *dev;
- struct net_device *master_dev = NULL;
struct ifinfomsg *ifm;
- char kind[MODULE_NAME_LEN];
char ifname[IFNAMSIZ];
- struct nlattr *tb[IFLA_MAX+1];
- struct nlattr *linkinfo[IFLA_INFO_MAX+1];
- unsigned char name_assign_type = NET_NAME_USER;
+ struct nlattr **data;
int err;
#ifdef CONFIG_MODULES
@@ -3034,193 +3046,200 @@ replay:
ops = NULL;
}
- if (1) {
- struct nlattr *attr[RTNL_MAX_TYPE + 1];
- struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
- struct nlattr **data = NULL;
- struct nlattr **slave_data = NULL;
- struct net *dest_net, *link_net = NULL;
-
- if (ops) {
- if (ops->maxtype > RTNL_MAX_TYPE)
- return -EINVAL;
+ data = NULL;
+ if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE)
+ return -EINVAL;
- if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested(attr, ops->maxtype,
- linkinfo[IFLA_INFO_DATA],
- ops->policy, NULL);
- if (err < 0)
- return err;
- data = attr;
- }
- if (ops->validate) {
- err = ops->validate(tb, data, extack);
- if (err < 0)
- return err;
- }
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ err = nla_parse_nested(attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy, extack);
+ if (err < 0)
+ return err;
+ data = attr;
+ }
+ if (ops->validate) {
+ err = ops->validate(tb, data, extack);
+ if (err < 0)
+ return err;
}
+ }
- if (m_ops) {
- if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
- return -EINVAL;
+ slave_data = NULL;
+ if (m_ops) {
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
- if (m_ops->slave_maxtype &&
- linkinfo[IFLA_INFO_SLAVE_DATA]) {
- err = nla_parse_nested(slave_attr,
- m_ops->slave_maxtype,
- linkinfo[IFLA_INFO_SLAVE_DATA],
- m_ops->slave_policy,
- NULL);
- if (err < 0)
- return err;
- slave_data = slave_attr;
- }
+ if (m_ops->slave_maxtype &&
+ linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ err = nla_parse_nested(slave_attr, m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy, extack);
+ if (err < 0)
+ return err;
+ slave_data = slave_attr;
}
+ }
- if (dev) {
- int status = 0;
-
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- return -EEXIST;
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
- return -EOPNOTSUPP;
+ if (dev) {
+ int status = 0;
- if (linkinfo[IFLA_INFO_DATA]) {
- if (!ops || ops != dev->rtnl_link_ops ||
- !ops->changelink)
- return -EOPNOTSUPP;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
- err = ops->changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
- }
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops ||
+ !ops->changelink)
+ return -EOPNOTSUPP;
- if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
- if (!m_ops || !m_ops->slave_changelink)
- return -EOPNOTSUPP;
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
+ }
- err = m_ops->slave_changelink(master_dev, dev,
- tb, slave_data,
- extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
- }
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
- return do_setlink(skb, dev, ifm, extack, tb, ifname,
- status);
+ err = m_ops->slave_changelink(master_dev, dev, tb,
+ slave_data, extack);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
- return rtnl_group_changelink(skb, net,
+ return do_setlink(skb, dev, ifm, extack, tb, ifname, status);
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(skb, net,
nla_get_u32(tb[IFLA_GROUP]),
ifm, extack, tb);
- return -ENODEV;
- }
+ return -ENODEV;
+ }
- if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
- return -EOPNOTSUPP;
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
- if (!ops) {
+ if (!ops) {
#ifdef CONFIG_MODULES
- if (kind[0]) {
- __rtnl_unlock();
- request_module("rtnl-link-%s", kind);
- rtnl_lock();
- ops = rtnl_link_ops_get(kind);
- if (ops)
- goto replay;
- }
-#endif
- return -EOPNOTSUPP;
+ if (kind[0]) {
+ __rtnl_unlock();
+ request_module("rtnl-link-%s", kind);
+ rtnl_lock();
+ ops = rtnl_link_ops_get(kind);
+ if (ops)
+ goto replay;
}
+#endif
+ NL_SET_ERR_MSG(extack, "Unknown device type");
+ return -EOPNOTSUPP;
+ }
- if (!ops->setup)
- return -EOPNOTSUPP;
+ if (!ops->setup)
+ return -EOPNOTSUPP;
- if (!ifname[0]) {
- snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
- name_assign_type = NET_NAME_ENUM;
- }
+ if (!ifname[0]) {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
- dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
+ dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
- if (tb[IFLA_LINK_NETNSID]) {
- int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
- link_net = get_net_ns_by_id(dest_net, id);
- if (!link_net) {
- err = -EINVAL;
- goto out;
- }
- err = -EPERM;
- if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
- goto out;
- }
-
- dev = rtnl_create_link(link_net ? : dest_net, ifname,
- name_assign_type, ops, tb);
- if (IS_ERR(dev)) {
- err = PTR_ERR(dev);
+ link_net = get_net_ns_by_id(dest_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ err = -EINVAL;
goto out;
}
+ err = -EPERM;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ } else {
+ link_net = NULL;
+ }
- dev->ifindex = ifm->ifi_index;
+ dev = rtnl_create_link(link_net ? : dest_net, ifname,
+ name_assign_type, ops, tb, extack);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
- if (ops->newlink) {
- err = ops->newlink(link_net ? : net, dev, tb, data,
- extack);
- /* Drivers should call free_netdev() in ->destructor
- * and unregister it on failure after registration
- * so that device could be finally freed in rtnl_unlock.
- */
- if (err < 0) {
- /* If device is not registered at all, free it now */
- if (dev->reg_state == NETREG_UNINITIALIZED)
- free_netdev(dev);
- goto out;
- }
- } else {
- err = register_netdevice(dev);
- if (err < 0) {
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink) {
+ err = ops->newlink(link_net ? : net, dev, tb, data, extack);
+ /* Drivers should call free_netdev() in ->destructor
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
free_netdev(dev);
- goto out;
- }
+ goto out;
+ }
+ } else {
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
}
- err = rtnl_configure_link(dev, ifm);
+ }
+ err = rtnl_configure_link(dev, ifm);
+ if (err < 0)
+ goto out_unregister;
+ if (link_net) {
+ err = dev_change_net_namespace(dev, dest_net, ifname);
if (err < 0)
goto out_unregister;
- if (link_net) {
- err = dev_change_net_namespace(dev, dest_net, ifname);
- if (err < 0)
- goto out_unregister;
- }
- if (tb[IFLA_MASTER]) {
- err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]),
- extack);
- if (err)
- goto out_unregister;
- }
+ }
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto out_unregister;
+ }
out:
- if (link_net)
- put_net(link_net);
- put_net(dest_net);
- return err;
+ if (link_net)
+ put_net(link_net);
+ put_net(dest_net);
+ return err;
out_unregister:
- if (ops->newlink) {
- LIST_HEAD(list_kill);
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- } else {
- unregister_netdevice(dev);
- }
- goto out;
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
}
+ goto out;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **attr;
+ int ret;
+
+ attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ ret = __rtnl_newlink(skb, nlh, attr, extack);
+ kfree(attr);
+ return ret;
}
static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -3800,6 +3819,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
{
int err;
+ if (dev->type != ARPHRD_ETHER)
+ return -EINVAL;
+
netif_addr_lock_bh(dev);
err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
@@ -3999,6 +4021,160 @@ out:
return skb->len;
}
+static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
+ struct nlattr **tb, u8 *ndm_flags,
+ int *br_idx, int *brport_idx, u8 **addr,
+ u16 *vid, struct netlink_ext_ack *extack)
+{
+ struct ndmsg *ndm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
+ return -EINVAL;
+ }
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
+ nda_policy, extack);
+ if (err < 0)
+ return err;
+
+ *ndm_flags = ndm->ndm_flags;
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_MASTER:
+ *br_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_LLADDR:
+ if (nla_len(tb[i]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "Invalid address in fdb get request");
+ return -EINVAL;
+ }
+ *addr = nla_data(tb[i]);
+ break;
+ case NDA_VLAN:
+ err = fdb_vid_parse(tb[i], vid, extack);
+ if (err)
+ return err;
+ break;
+ case NDA_VNI:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = NULL, *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NDA_MAX + 1];
+ struct sk_buff *skb;
+ int brport_idx = 0;
+ u8 ndm_flags = 0;
+ int br_idx = 0;
+ u8 *addr = NULL;
+ u16 vid = 0;
+ int err;
+
+ err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx,
+ &brport_idx, &addr, &vid, extack);
+ if (err < 0)
+ return err;
+
+ if (brport_idx) {
+ dev = __dev_get_by_index(net, brport_idx);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ return -ENODEV;
+ }
+ }
+
+ if (br_idx) {
+ if (dev) {
+ NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive");
+ return -EINVAL;
+ }
+
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Invalid master ifindex");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ }
+
+ if (dev) {
+ if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
+ if (!(dev->priv_flags & IFF_BRIDGE_PORT)) {
+ NL_SET_ERR_MSG(extack, "Device is not a bridge port");
+ return -EINVAL;
+ }
+ br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Master of device not found");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ } else {
+ if (!(ndm_flags & NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Missing NTF_SELF");
+ return -EINVAL;
+ }
+ ops = dev->netdev_ops;
+ }
+ }
+
+ if (!br_dev && !dev) {
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return -ENODEV;
+ }
+
+ if (!ops || !ops->ndo_fdb_get) {
+ NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (br_dev)
+ dev = br_dev;
+ err = ops->ndo_fdb_get(skb, tb, dev, addr, vid,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+ if (err)
+ goto out;
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ kfree_skb(skb);
+ return err;
+}
+
static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
unsigned int attrnum, unsigned int flag)
{
@@ -4310,7 +4486,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags,
+ extack);
if (err)
goto out;
@@ -4322,7 +4499,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
err = -EOPNOTSUPP;
else
err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
- flags);
+ flags,
+ extack);
if (!err) {
flags &= ~BRIDGE_FLAGS_SELF;
@@ -5057,7 +5235,7 @@ void __init rtnetlink_init(void)
rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a8217e221e19..37317ffec146 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,9 @@
struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
+#ifdef CONFIG_SKB_EXTENSIONS
+static struct kmem_cache *skbuff_ext_cache __ro_after_init;
+#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -606,7 +609,6 @@ fastpath:
void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
- secpath_reset(skb);
if (skb->destructor) {
WARN_ON(in_irq());
skb->destructor(skb);
@@ -614,9 +616,7 @@ void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb_nfct(skb));
#endif
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- nf_bridge_put(skb->nf_bridge);
-#endif
+ skb_ext_put(skb);
}
/* Free everything but the sk_buff shell. */
@@ -796,9 +796,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->dev = old->dev;
memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
-#ifdef CONFIG_XFRM
- new->sp = secpath_get(old->sp);
-#endif
+ __skb_ext_copy(new, old);
__nf_copy(new, old, false);
/* Note : this field could be in headers_start/headers_end section
@@ -1089,7 +1087,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put);
-void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
if (uarg) {
struct sock *sk = skb_from_uarg(uarg)->sk;
@@ -1097,7 +1095,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
- sock_zerocopy_put(uarg);
+ if (have_uref)
+ sock_zerocopy_put(uarg);
}
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
@@ -1105,6 +1104,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+{
+ return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
@@ -1131,7 +1136,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return err;
}
- skb_zcopy_set(skb, uarg);
+ skb_zcopy_set(skb, uarg, NULL);
return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -1151,7 +1156,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
return -EIO;
}
- skb_zcopy_set(nskb, skb_uarg(orig));
+ skb_zcopy_set(nskb, skb_uarg(orig), NULL);
}
return 0;
}
@@ -1925,8 +1930,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
struct sk_buff *insp = NULL;
do {
- BUG_ON(!list);
-
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
@@ -2366,19 +2369,6 @@ error:
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
-/* Send skb data on a socket. */
-int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
-{
- int ret = 0;
-
- lock_sock(sk);
- ret = skb_send_sock_locked(sk, skb, offset, len);
- release_sock(sk);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(skb_send_sock);
-
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
@@ -2645,6 +2635,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+/* This function assumes skb->csum already holds pseudo header's checksum,
+ * which has been changed from the hardware checksum, for example, by
+ * __skb_checksum_validate_complete(). And, the original skb->csum must
+ * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
+ *
+ * It returns non-zero if the recomputed checksum is still invalid, otherwise
+ * zero. The new checksum is stored back into skb->csum unless the skb is
+ * shared.
+ */
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ sum = csum_fold(csum_add(skb->csum, csum));
+ /* This check is inverted, because we already knew the hardware
+ * checksum is invalid before calling this function. So, if the
+ * re-computed checksum is valid instead, then we have a mismatch
+ * between the original skb->csum and skb_checksum(). This means either
+ * the original hardware checksum is incorrect or we screw up skb->csum
+ * when moving skb->data around.
+ */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
net_warn_ratelimited(
@@ -2962,28 +3011,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
}
EXPORT_SYMBOL(skb_append);
-/**
- * skb_insert - insert a buffer
- * @old: buffer to insert before
- * @newsk: buffer to insert
- * @list: list to use
- *
- * Place a packet before a given packet in a list. The list locks are
- * taken and this function is atomic with respect to other list locked
- * calls.
- *
- * A buffer cannot be placed on two lists at the same time.
- */
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&list->lock, flags);
- __skb_insert(newsk, old->prev, old, list);
- spin_unlock_irqrestore(&list->lock, flags);
-}
-EXPORT_SYMBOL(skb_insert);
-
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, const int pos)
@@ -3873,6 +3900,46 @@ done:
}
EXPORT_SYMBOL_GPL(skb_gro_receive);
+#ifdef CONFIG_SKB_EXTENSIONS
+#define SKB_EXT_ALIGN_VALUE 8
+#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
+
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
+#endif
+#ifdef CONFIG_XFRM
+ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
+#endif
+};
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+#ifdef CONFIG_XFRM
+ skb_ext_type_len[SKB_EXT_SEC_PATH] +
+#endif
+ 0;
+}
+
+static void skb_extensions_init(void)
+{
+ BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+ BUILD_BUG_ON(skb_ext_total_length() > 255);
+
+ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
+ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
+#else
+static void skb_extensions_init(void) {}
+#endif
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
@@ -3887,6 +3954,7 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ skb_extensions_init();
}
static int
@@ -4856,7 +4924,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
- skb->offload_mr_fwd_mark = 0;
+ skb->offload_l3_fwd_mark = 0;
#endif
if (!xnet)
@@ -5128,7 +5196,7 @@ int skb_vlan_pop(struct sk_buff *skb)
int err;
if (likely(skb_vlan_tag_present(skb))) {
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
@@ -5525,3 +5593,148 @@ void skb_condense(struct sk_buff *skb)
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
+}
+
+static struct skb_ext *skb_ext_alloc(void)
+{
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+
+ if (new) {
+ memset(new->offset, 0, sizeof(new->offset));
+ refcount_set(&new->refcnt, 1);
+ }
+
+ return new;
+}
+
+static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
+ unsigned int old_active)
+{
+ struct skb_ext *new;
+
+ if (refcount_read(&old->refcnt) == 1)
+ return old;
+
+ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+
+ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+ refcount_set(&new->refcnt, 1);
+
+#ifdef CONFIG_XFRM
+ if (old_active & (1 << SKB_EXT_SEC_PATH)) {
+ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_hold(sp->xvec[i]);
+ }
+#endif
+ __skb_ext_put(old);
+ return new;
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of clones buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *new, *old = NULL;
+ unsigned int newlen, newoff;
+
+ if (skb->active_extensions) {
+ old = skb->extensions;
+
+ new = skb_ext_maybe_cow(old, skb->active_extensions);
+ if (!new)
+ return NULL;
+
+ if (__skb_ext_exist(new, id))
+ goto set_active;
+
+ newoff = new->chunks;
+ } else {
+ newoff = SKB_EXT_CHUNKSIZEOF(*new);
+
+ new = skb_ext_alloc();
+ if (!new)
+ return NULL;
+ }
+
+ newlen = newoff + skb_ext_type_len[id];
+ new->chunks = newlen;
+ new->offset[id] = newoff;
+set_active:
+ skb->extensions = new;
+ skb->active_extensions |= 1 << id;
+ return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+#ifdef CONFIG_XFRM
+static void skb_ext_put_sp(struct sec_path *sp)
+{
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_put(sp->xvec[i]);
+}
+#endif
+
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *ext = skb->extensions;
+
+ skb->active_extensions &= ~(1 << id);
+ if (skb->active_extensions == 0) {
+ skb->extensions = NULL;
+ __skb_ext_put(ext);
+#ifdef CONFIG_XFRM
+ } else if (id == SKB_EXT_SEC_PATH &&
+ refcount_read(&ext->refcnt) == 1) {
+ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
+
+ skb_ext_put_sp(sp);
+ sp->len = 0;
+#endif
+ }
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_put(struct skb_ext *ext)
+{
+ /* If this is last clone, nothing can increment
+ * it after check passes. Avoids one atomic op.
+ */
+ if (refcount_read(&ext->refcnt) == 1)
+ goto free_now;
+
+ if (!refcount_dec_and_test(&ext->refcnt))
+ return;
+free_now:
+#ifdef CONFIG_XFRM
+ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
+ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
+#endif
+
+ kmem_cache_free(skbuff_ext_cache, ext);
+}
+EXPORT_SYMBOL(__skb_ext_put);
+#endif /* CONFIG_SKB_EXTENSIONS */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 56a99d0c9aa0..d6d5c20d7044 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -94,6 +94,9 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
}
while (len) {
+ if (sk_msg_full(dst))
+ return -ENOSPC;
+
sge_len = sge->length - off;
sge_off = sge->offset + off;
if (sge_len > len)
@@ -403,7 +406,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
msg->skb = skb;
sk_psock_queue_msg(psock, msg);
- sk->sk_data_ready(sk);
+ sk_psock_data_ready(sk, psock);
return copied;
}
@@ -572,6 +575,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
rcu_assign_sk_user_data(sk, NULL);
sk_psock_cork_free(psock);
+ sk_psock_zap_ingress(psock);
sk_psock_restore_proto(sk, psock);
write_lock_bh(&sk->sk_callback_lock);
@@ -580,7 +584,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu_sched(&psock->rcu, sk_psock_destroy);
+ call_rcu(&psock->rcu, sk_psock_destroy);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -669,6 +673,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
bool ingress;
switch (verdict) {
+ case __SK_PASS:
+ sk_other = psock->sk;
+ if (sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ goto out_free;
+ }
+ if (atomic_read(&sk_other->sk_rmem_alloc) <=
+ sk_other->sk_rcvbuf) {
+ struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
+
+ tcp->bpf.flags |= BPF_F_INGRESS;
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ break;
+ }
+ goto out_free;
case __SK_REDIRECT:
sk_other = tcp_skb_bpf_redirect_fetch(skb);
if (unlikely(!sk_other))
@@ -735,7 +755,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
}
/* Called with socket lock held. */
-static void sk_psock_data_ready(struct sock *sk)
+static void sk_psock_strp_data_ready(struct sock *sk)
{
struct sk_psock *psock;
@@ -783,7 +803,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
return;
parser->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_data_ready;
+ sk->sk_data_ready = sk_psock_strp_data_ready;
sk->sk_write_space = sk_psock_write_space;
parser->enabled = true;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 080a880a1761..f00902c532cc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
lock_sock(sk);
sk->sk_bound_dev_if = index;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
@@ -698,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ sk_dst_reset(sk);
break;
case SO_BROADCAST:
sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
@@ -950,10 +953,12 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- else
+ } else if (val != sk->sk_mark) {
sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
break;
case SO_RXQ_OVFL:
@@ -1014,7 +1019,10 @@ set_rcvbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (!((sk->sk_type == SOCK_STREAM &&
+ sk->sk_protocol == IPPROTO_TCP) ||
+ (sk->sk_type == SOCK_DGRAM &&
+ sk->sk_protocol == IPPROTO_UDP)))
ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index ba5cba56f574..d8fe3e549373 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
+EXPORT_SYMBOL(reuseport_add_sock);
void reuseport_detach_sock(struct sock *sk)
{
diff --git a/net/core/stream.c b/net/core/stream.c
index 7d329fb1f553..e94bb02a5629 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk)
struct socket *sock = sk->sk_socket;
struct socket_wq *wq;
- if (sk_stream_is_writeable(sk) && sock) {
+ if (__sk_stream_is_writeable(sk, 1) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 37b4667128a3..d67ec17f2cc8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,8 @@ static int two __maybe_unused = 2;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
+static long long_one __maybe_unused = 1;
+static long long_max __maybe_unused = LONG_MAX;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
+
+static int
+proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
static struct ctl_table net_core_table[] = {
@@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = {
{
.procname = "bpf_jit_limit",
.data = &bpf_jit_limit,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0600,
- .proc_handler = proc_dointvec_minmax_bpf_restricted,
- .extra1 = &one,
+ .proc_handler = proc_dolongvec_minmax_bpf_restricted,
+ .extra1 = &long_one,
+ .extra2 = &long_max,
},
#endif
{
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8e08cea6f178..26a21d97b6b0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err);
* check at all. A more general error queue to queue errors for later handling
* is probably better.
*/
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
+static int dccp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (struct iphdr *)skb->data;
const u8 offset = iph->ihl << 2;
@@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ dccp_req_err(sk, seq);
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b18a6a..d5740bad5b18 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
}
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ dccp_req_err(sk, seq);
+ return 0;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 43733accf58e..2cc5fbb1b29e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -948,6 +948,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -960,7 +961,6 @@ int inet_dccp_listen(struct socket *sock, int backlog)
if (err)
goto out;
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
@@ -1139,8 +1139,11 @@ static int __init dccp_init(void)
rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
if (rc)
goto out_fail;
- rc = -ENOBUFS;
inet_hashinfo_init(&dccp_hashinfo);
+ rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
+ if (rc)
+ goto out_fail;
+ rc = -ENOBUFS;
dccp_hashinfo.bind_bucket_cachep =
kmem_cache_create("dccp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 7d6ff983ba2c..bdccc46a2921 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -192,7 +192,7 @@ static int check_port(__le16 port)
static unsigned short port_alloc(struct sock *sk)
{
struct dn_scp *scp = DN_SK(sk);
-static unsigned short port = 0x2000;
+ static unsigned short port = 0x2000;
unsigned short i_port = port;
while(check_port(cpu_to_le16(++port)) != 0) {
@@ -2405,7 +2405,7 @@ static void __exit decnet_exit(void)
proto_unregister(&dn_proto);
- rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
module_exit(decnet_exit);
#endif
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 48c41918fb35..91e52973ee13 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -44,6 +44,10 @@ config NET_DSA_TAG_GSWIP
config NET_DSA_TAG_KSZ
bool
+config NET_DSA_TAG_KSZ9477
+ bool
+ select NET_DSA_TAG_KSZ
+
config NET_DSA_TAG_LAN9303
bool
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index a69c1790bbfc..aee909bcddc4 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -55,8 +55,8 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
#ifdef CONFIG_NET_DSA_TAG_GSWIP
[DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops,
#endif
-#ifdef CONFIG_NET_DSA_TAG_KSZ
- [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
+#ifdef CONFIG_NET_DSA_TAG_KSZ9477
+ [DSA_TAG_PROTO_KSZ9477] = &ksz9477_netdev_ops,
#endif
#ifdef CONFIG_NET_DSA_TAG_LAN9303
[DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops,
@@ -91,8 +91,8 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
#ifdef CONFIG_NET_DSA_TAG_GSWIP
[DSA_TAG_PROTO_GSWIP] = "gswip",
#endif
-#ifdef CONFIG_NET_DSA_TAG_KSZ
- [DSA_TAG_PROTO_KSZ] = "ksz",
+#ifdef CONFIG_NET_DSA_TAG_KSZ9477
+ [DSA_TAG_PROTO_KSZ9477] = "ksz9477",
#endif
#ifdef CONFIG_NET_DSA_TAG_LAN9303
[DSA_TAG_PROTO_LAN9303] = "lan9303",
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 9e4fd04ab53c..026a05774bf7 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -210,7 +210,7 @@ extern const struct dsa_device_ops edsa_netdev_ops;
extern const struct dsa_device_ops gswip_netdev_ops;
/* tag_ksz.c */
-extern const struct dsa_device_ops ksz_netdev_ops;
+extern const struct dsa_device_ops ksz9477_netdev_ops;
/* tag_lan9303.c */
extern const struct dsa_device_ops lan9303_netdev_ops;
diff --git a/net/dsa/master.c b/net/dsa/master.c
index c90ee3227dea..71bb15f491c8 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -158,8 +158,59 @@ static void dsa_master_ethtool_teardown(struct net_device *dev)
cpu_dp->orig_ethtool_ops = NULL;
}
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *dev = to_net_dev(d);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+ return sprintf(buf, "%s\n",
+ dsa_tag_protocol_to_str(cpu_dp->tag_ops));
+}
+static DEVICE_ATTR_RO(tagging);
+
+static struct attribute *dsa_slave_attrs[] = {
+ &dev_attr_tagging.attr,
+ NULL
+};
+
+static const struct attribute_group dsa_group = {
+ .name = "dsa",
+ .attrs = dsa_slave_attrs,
+};
+
+static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp)
+{
+ unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead;
+ int err;
+
+ rtnl_lock();
+ if (mtu <= dev->max_mtu) {
+ err = dev_set_mtu(dev, mtu);
+ if (err)
+ netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n");
+ }
+ rtnl_unlock();
+}
+
+static void dsa_master_reset_mtu(struct net_device *dev)
+{
+ int err;
+
+ rtnl_lock();
+ err = dev_set_mtu(dev, ETH_DATA_LEN);
+ if (err)
+ netdev_dbg(dev,
+ "Unable to reset MTU to exclude DSA overheads\n");
+ rtnl_unlock();
+}
+
int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
+ int ret;
+
+ dsa_master_set_mtu(dev, cpu_dp);
+
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
* sent to the tag format's receive function.
@@ -168,12 +219,22 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
dev->dsa_ptr = cpu_dp;
- return dsa_master_ethtool_setup(dev);
+ ret = dsa_master_ethtool_setup(dev);
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
+ if (ret)
+ dsa_master_ethtool_teardown(dev);
+
+ return ret;
}
void dsa_master_teardown(struct net_device *dev)
{
+ sysfs_remove_group(&dev->dev.kobj, &dsa_group);
dsa_master_ethtool_teardown(dev);
+ dsa_master_reset_mtu(dev);
dev->dsa_ptr = NULL;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ed0595459df1..2d7e01b23572 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,9 +252,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
- if (netif_is_bridge_master(vlan->obj.orig_dev))
- return -EOPNOTSUPP;
-
if (br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7d0c19e7edcf..a3fcc1d01615 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1050,35 +1050,12 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
static const struct switchdev_ops dsa_slave_switchdev_ops = {
.switchdev_port_attr_get = dsa_slave_port_attr_get,
.switchdev_port_attr_set = dsa_slave_port_attr_set,
- .switchdev_port_obj_add = dsa_slave_port_obj_add,
- .switchdev_port_obj_del = dsa_slave_port_obj_del,
};
static struct device_type dsa_type = {
.name = "dsa",
};
-static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
- char *buf)
-{
- struct net_device *dev = to_net_dev(d);
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return sprintf(buf, "%s\n",
- dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops));
-}
-static DEVICE_ATTR_RO(tagging);
-
-static struct attribute *dsa_slave_attrs[] = {
- &dev_attr_tagging.attr,
- NULL
-};
-
-static const struct attribute_group dsa_group = {
- .name = "dsa",
- .attrs = dsa_slave_attrs,
-};
-
static void dsa_slave_phylink_validate(struct net_device *dev,
unsigned long *supported,
struct phylink_link_state *state)
@@ -1374,14 +1351,8 @@ int dsa_slave_create(struct dsa_port *port)
goto out_phy;
}
- ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group);
- if (ret)
- goto out_unreg;
-
return 0;
-out_unreg:
- unregister_netdev(slave_dev);
out_phy:
rtnl_lock();
phylink_disconnect_phy(p->dp->pl);
@@ -1405,7 +1376,6 @@ void dsa_slave_destroy(struct net_device *slave_dev)
rtnl_unlock();
dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
- sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group);
unregister_netdev(slave_dev);
phylink_destroy(dp->pl);
free_percpu(p->stats64);
@@ -1557,6 +1527,44 @@ err_fdb_work_init:
return NOTIFY_BAD;
}
+static int
+dsa_slave_switchdev_port_obj_event(unsigned long event,
+ struct net_device *netdev,
+ struct switchdev_notifier_port_obj_info *port_obj_info)
+{
+ int err = -EOPNOTSUPP;
+
+ switch (event) {
+ case SWITCHDEV_PORT_OBJ_ADD:
+ err = dsa_slave_port_obj_add(netdev, port_obj_info->obj,
+ port_obj_info->trans);
+ break;
+ case SWITCHDEV_PORT_OBJ_DEL:
+ err = dsa_slave_port_obj_del(netdev, port_obj_info->obj);
+ break;
+ }
+
+ port_obj_info->handled = true;
+ return notifier_from_errno(err);
+}
+
+static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+
+ if (!dsa_slave_dev_check(dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case SWITCHDEV_PORT_OBJ_ADD: /* fall through */
+ case SWITCHDEV_PORT_OBJ_DEL:
+ return dsa_slave_switchdev_port_obj_event(event, dev, ptr);
+ }
+
+ return NOTIFY_DONE;
+}
+
static struct notifier_block dsa_slave_nb __read_mostly = {
.notifier_call = dsa_slave_netdevice_event,
};
@@ -1565,8 +1573,13 @@ static struct notifier_block dsa_slave_switchdev_notifier = {
.notifier_call = dsa_slave_switchdev_event,
};
+static struct notifier_block dsa_slave_switchdev_blocking_notifier = {
+ .notifier_call = dsa_slave_switchdev_blocking_event,
+};
+
int dsa_slave_register_notifier(void)
{
+ struct notifier_block *nb;
int err;
err = register_netdevice_notifier(&dsa_slave_nb);
@@ -1577,8 +1590,15 @@ int dsa_slave_register_notifier(void)
if (err)
goto err_switchdev_nb;
+ nb = &dsa_slave_switchdev_blocking_notifier;
+ err = register_switchdev_blocking_notifier(nb);
+ if (err)
+ goto err_switchdev_blocking_nb;
+
return 0;
+err_switchdev_blocking_nb:
+ unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
err_switchdev_nb:
unregister_netdevice_notifier(&dsa_slave_nb);
return err;
@@ -1586,8 +1606,14 @@ err_switchdev_nb:
void dsa_slave_unregister_notifier(void)
{
+ struct notifier_block *nb;
int err;
+ nb = &dsa_slave_switchdev_blocking_notifier;
+ err = unregister_switchdev_blocking_notifier(nb);
+ if (err)
+ pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
+
err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
if (err)
pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 2b06bb91318b..4aa1d368a5ae 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -174,6 +174,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops brcm_netdev_ops = {
.xmit = brcm_tag_xmit,
.rcv = brcm_tag_rcv,
+ .overhead = BRCM_TAG_LEN,
};
#endif
@@ -196,5 +197,6 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb,
const struct dsa_device_ops brcm_prepend_netdev_ops = {
.xmit = brcm_tag_xmit_prepend,
.rcv = brcm_tag_rcv_prepend,
+ .overhead = BRCM_TAG_LEN,
};
#endif
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index cd13cfc542ce..8b2f92e3f3a2 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -149,4 +149,5 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops dsa_netdev_ops = {
.xmit = dsa_xmit,
.rcv = dsa_rcv,
+ .overhead = DSA_HLEN,
};
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 4083326b806e..f5b87ee5c94e 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -168,4 +168,5 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops edsa_netdev_ops = {
.xmit = edsa_xmit,
.rcv = edsa_rcv,
+ .overhead = EDSA_HLEN,
};
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index 49e9b73f1be3..cb6f82ffe5eb 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -106,4 +106,5 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
const struct dsa_device_ops gswip_netdev_ops = {
.xmit = gswip_tag_xmit,
.rcv = gswip_tag_rcv,
+ .overhead = GSWIP_RX_HEADER_LEN,
};
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 0f62effad88f..da71b9e2af52 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -14,34 +14,18 @@
#include <net/dsa.h>
#include "dsa_priv.h"
-/* For Ingress (Host -> KSZ), 2 bytes are added before FCS.
- * ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
- * ---------------------------------------------------------------------------
- * tag0 : Prioritization (not used now)
- * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
- *
- * For Egress (KSZ -> Host), 1 byte is added before FCS.
- * ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
- * ---------------------------------------------------------------------------
- * tag0 : zero-based value represents port
- * (eg, 0x00=port1, 0x02=port3, 0x06=port7)
- */
-
-#define KSZ_INGRESS_TAG_LEN 2
-#define KSZ_EGRESS_TAG_LEN 1
+/* Typically only one byte is used for tail tag. */
+#define KSZ_EGRESS_TAG_LEN 1
-static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *ksz_common_xmit(struct sk_buff *skb,
+ struct net_device *dev, int len)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
struct sk_buff *nskb;
int padlen;
- u8 *tag;
padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
- if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
+ if (skb_tailroom(skb) >= padlen + len) {
/* Let dsa_slave_xmit() free skb */
if (__skb_put_padto(skb, skb->len + padlen, false))
return NULL;
@@ -49,7 +33,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
nskb = skb;
} else {
nskb = alloc_skb(NET_IP_ALIGN + skb->len +
- padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC);
+ padlen + len, GFP_ATOMIC);
if (!nskb)
return NULL;
skb_reserve(nskb, NET_IP_ALIGN);
@@ -70,33 +54,88 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
consume_skb(skb);
}
- tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
- tag[0] = 0;
- tag[1] = 1 << dp->index; /* destination port */
-
return nskb;
}
-static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned int port, unsigned int len)
{
- u8 *tag;
- int source_port;
+ skb->dev = dsa_master_find_slave(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
- tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ pskb_trim_rcsum(skb, skb->len - len);
- source_port = tag[0] & 7;
+ return skb;
+}
- skb->dev = dsa_master_find_slave(dev, 0, source_port);
- if (!skb->dev)
+/*
+ * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : Prioritization (not used now)
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ9477 -> Host), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x06=port7)
+ */
+
+#define KSZ9477_INGRESS_TAG_LEN 2
+#define KSZ9477_PTP_TAG_LEN 4
+#define KSZ9477_PTP_TAG_INDICATION 0x80
+
+#define KSZ9477_TAIL_TAG_OVERRIDE BIT(9)
+#define KSZ9477_TAIL_TAG_LOOKUP BIT(10)
+
+static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct sk_buff *nskb;
+ u16 *tag;
+ u8 *addr;
+
+ nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN);
+ if (!nskb)
return NULL;
- pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN);
+ /* Tag encoding */
+ tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN);
+ addr = skb_mac_header(nskb);
- return skb;
+ *tag = BIT(dp->index);
+
+ if (is_link_local_ether_addr(addr))
+ *tag |= KSZ9477_TAIL_TAG_OVERRIDE;
+
+ *tag = cpu_to_be16(*tag);
+
+ return nskb;
+}
+
+static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt)
+{
+ /* Tag decoding */
+ u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ unsigned int port = tag[0] & 7;
+ unsigned int len = KSZ_EGRESS_TAG_LEN;
+
+ /* Extra 4-bytes PTP timestamp */
+ if (tag[0] & KSZ9477_PTP_TAG_INDICATION)
+ len += KSZ9477_PTP_TAG_LEN;
+
+ return ksz_common_rcv(skb, dev, port, len);
}
-const struct dsa_device_ops ksz_netdev_ops = {
- .xmit = ksz_xmit,
- .rcv = ksz_rcv,
+const struct dsa_device_ops ksz9477_netdev_ops = {
+ .xmit = ksz9477_xmit,
+ .rcv = ksz9477_rcv,
+ .overhead = KSZ9477_INGRESS_TAG_LEN,
};
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 548c00254c07..f48889e46ff7 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -140,4 +140,5 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops lan9303_netdev_ops = {
.xmit = lan9303_xmit,
.rcv = lan9303_rcv,
+ .overhead = LAN9303_TAG_LEN,
};
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 11535bc70743..f39f4dfeda34 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -109,4 +109,5 @@ const struct dsa_device_ops mtk_netdev_ops = {
.xmit = mtk_tag_xmit,
.rcv = mtk_tag_rcv,
.flow_dissect = mtk_tag_flow_dissect,
+ .overhead = MTK_HDR_LEN,
};
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 613f4ee97771..ed4f6dc26365 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -101,4 +101,5 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops qca_netdev_ops = {
.xmit = qca_tag_xmit,
.rcv = qca_tag_rcv,
+ .overhead = QCA_HDR_LEN,
};
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 56197f0d9608..b40756ed6e57 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -84,4 +84,5 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
const struct dsa_device_ops trailer_netdev_ops = {
.xmit = trailer_xmit,
.rcv = trailer_rcv,
+ .overhead = 4,
};
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index fd8faa0dfa61..4c520110b04f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -47,6 +47,7 @@
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/nvmem-consumer.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
@@ -165,15 +166,17 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
eth = (struct ethhdr *)skb->data;
skb_pull_inline(skb, ETH_HLEN);
- if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
- if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
- skb->pkt_type = PACKET_BROADCAST;
- else
- skb->pkt_type = PACKET_MULTICAST;
+ if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
+ dev->dev_addr))) {
+ if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
+ if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+ } else {
+ skb->pkt_type = PACKET_OTHERHOST;
+ }
}
- else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
- dev->dev_addr)))
- skb->pkt_type = PACKET_OTHERHOST;
/*
* Some variants of DSA tagging don't have an ethertype field
@@ -548,3 +551,40 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);
+
+/**
+ * Obtain the MAC address from an nvmem cell named 'mac-address' associated
+ * with given device.
+ *
+ * @dev: Device with which the mac-address cell is associated.
+ * @addrbuf: Buffer to which the MAC address will be copied on success.
+ *
+ * Returns 0 on success or a negative error number on failure.
+ */
+int nvmem_get_mac_address(struct device *dev, void *addrbuf)
+{
+ struct nvmem_cell *cell;
+ const void *mac;
+ size_t len;
+
+ cell = nvmem_cell_get(dev, "mac-address");
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ mac = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ kfree(mac);
+ return -EINVAL;
+ }
+
+ ether_addr_copy(addrbuf, mac);
+ kfree(mac);
+
+ return 0;
+}
+EXPORT_SYMBOL(nvmem_get_mac_address);
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index ca53efa17be1..8bec827081cd 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -48,6 +48,9 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct neighbour *n;
+ if (!daddr)
+ return -EINVAL;
+
/* TODO:
* if this package isn't ipv6 one, where should it be routed?
*/
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index b231e40f006a..0c25c0bcc4da 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -242,7 +242,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
* dev_set_mac_address require RTNL_LOCK
*/
rtnl_lock();
- rc = dev_set_mac_address(dev, &addr);
+ rc = dev_set_mac_address(dev, &addr, NULL);
rtnl_unlock();
if (rc)
goto dev_unregister;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1fbe2f815474..0dfb72c46671 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
@@ -1385,6 +1385,10 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
+ struct sk_buff *));
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
const struct net_offload *ops;
@@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
@@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return -EINVAL;
}
+INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
__be16 newlen = htons(skb->len - nhoff);
@@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
* because any hdr with option will have been flushed in
* inet_gro_receive().
*/
- err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+ err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
+ tcp4_gro_complete, udp4_gro_complete,
+ skb, nhoff + sizeof(*iph));
out_unlock:
rcu_read_unlock();
@@ -1964,6 +1973,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
ping_init();
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a34602ae27de..04ba321ae5ce 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -952,17 +952,18 @@ static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ipv4_is_zeronet(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
rc = 0;
else {
__u32 haddr = ntohl(addr);
-
if (IN_CLASSA(haddr))
rc = 8;
else if (IN_CLASSB(haddr))
rc = 16;
else if (IN_CLASSC(haddr))
rc = 24;
+ else if (IN_CLASSE(haddr))
+ rc = 32;
}
return rc;
@@ -1100,7 +1101,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
inet_del_ifa(in_dev, ifap, 1);
break;
}
- ret = dev_change_flags(dev, ifr->ifr_flags);
+ ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
break;
case SIOCSIFADDR: /* Set interface address (and family) */
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9e1c840596c5..5459f41fc26f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)
void *tmp;
struct xfrm_state *x;
- if (xo && (xo->flags & XFRM_DEV_RESUME))
- x = skb->sp->xvec[skb->sp->len - 1];
- else
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
x = skb_dst(skb)->xfrm;
+ }
tmp = ESP_SKB_CB(skb)->tmp;
esp_ssg_unref(x, tmp);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 58834a10c0be..8756e0e790d2 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -46,11 +46,12 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
xo = xfrm_offload(skb);
if (!xo || !(xo->flags & CRYPTO_DONE)) {
- err = secpath_set(skb);
- if (err)
+ struct sec_path *sp = secpath_set(skb);
+
+ if (!sp)
goto out;
- if (skb->sp->len == XFRM_MAX_DEPTH)
+ if (sp->len == XFRM_MAX_DEPTH)
goto out;
x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
@@ -59,8 +60,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
if (!x)
goto out;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
xo = xfrm_offload(skb);
if (!xo) {
@@ -114,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
struct crypto_aead *aead;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return ERR_PTR(-EINVAL);
@@ -121,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
return ERR_PTR(-EINVAL);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
if (!fi)
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
- cfg->fc_mx_len);
+ cfg->fc_mx_len, extack);
if (unlikely(IS_ERR(fi->fib_metrics))) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 500a59906b87..0c9f171fb085 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -3,6 +3,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -1003,15 +1004,89 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
+ .err_handler = gue_err,
};
static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
+ .err_handler = gue_err,
};
static int ip_tunnel_encap_add_fou_ops(void)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ drop:
return NET_RX_DROP;
}
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
if (ver >= GREPROTO_MAX)
- return;
+ return -EINVAL;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
if (proto && proto->err_handler)
proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
rcu_read_unlock();
+
+ return err;
}
static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ error:
goto drop;
}
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
*/
if (icmph->type != ICMP_ECHOREPLY) {
ping_err(skb, offset, info);
- return;
+ return 0;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
}
/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 15e7f7915a21..6ea523d71947 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ int l3mdev;
+ l3mdev = inet_sk_bound_l3mdev(sk);
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +221,8 @@ other_parity_scan:
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false))
goto success;
goto next_port;
@@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
kuid_t uid = sock_i_uid(sk);
+ int l3mdev;
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
@@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
@@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4e5bc4b2f14e..1a4e9ff02762 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -998,7 +998,9 @@ next_chunk:
if (!inet_diag_bc_sk(bc, sk))
goto next_normal;
- sock_hold(sk);
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
num_arr[accum] = num;
sk_arr[accum] = sk;
if (++accum == SKARR_SZ)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 411dd7a90046..942265d65eb3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
+ int l3mdev;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
@@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
return -ENOENT;
}
if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket's icsk_bind_hash is the same
@@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
- tb->port == port)
+ tb->l3mdev == l3mdev && tb->port == port)
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
- sock_net(sk), head, port);
+ sock_net(sk), head, port,
+ l3mdev);
if (!tb) {
spin_unlock(&head->lock);
return -ENOMEM;
@@ -228,26 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
const int dif, const int sdif, bool exact_dif)
{
int score = -1;
- struct inet_sock *inet = inet_sk(sk);
- if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
+ if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
!ipv6_only_sock(sk)) {
- __be32 rcv_saddr = inet->inet_rcv_saddr;
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
+
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+
score = sk->sk_family == PF_INET ? 2 : 1;
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- return -1;
- score += 4;
- }
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
@@ -303,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- bool exact_dif = inet_exact_dif_match(net, skb);
struct inet_listen_hashbucket *ilb2;
- struct sock *sk, *result = NULL;
- int score, hiscore = 0;
+ struct sock *result = NULL;
unsigned int hash2;
- u32 phash = 0;
-
- if (ilb->count <= 10 || !hashinfo->lhash2)
- goto port_lookup;
-
- /* Too many sk in the ilb bucket (which is hashed by port alone).
- * Try lhash2 (which is hashed by port and addr) instead.
- */
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
@@ -331,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net,
goto done;
/* Lookup lhash2 with INADDR_ANY */
-
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
+ saddr, sport, htonl(INADDR_ANY), hnum,
dif, sdif);
- goto done;
-
-port_lookup:
- sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
- if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- goto done;
- }
- result = sk;
- hiscore = score;
- }
- }
done:
if (unlikely(IS_ERR(result)))
return NULL;
@@ -675,6 +635,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 remaining, offset;
int ret, i, low, high;
static u32 hint;
+ int l3mdev;
if (port) {
head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -693,6 +654,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return ret;
}
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
@@ -719,7 +682,8 @@ other_parity_scan:
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
@@ -732,7 +696,7 @@ other_parity_scan:
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
@@ -798,13 +762,22 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+static void init_hashinfo_lhash2(struct inet_hashinfo *h)
+{
+ int i;
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_HEAD(&h->lhash2[i].head);
+ h->lhash2[i].count = 0;
+ }
+}
+
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
unsigned long high_limit)
{
- unsigned int i;
-
h->lhash2 = alloc_large_system_hash(name,
sizeof(*h->lhash2),
numentries,
@@ -814,13 +787,23 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
&h->lhash2_mask,
low_limit,
high_limit);
+ init_hashinfo_lhash2(h);
+}
- for (i = 0; i <= h->lhash2_mask; i++) {
- spin_lock_init(&h->lhash2[i].lock);
- INIT_HLIST_HEAD(&h->lhash2[i].head);
- h->lhash2[i].count = 0;
- }
+int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
+{
+ h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
+ if (!h->lhash2)
+ return -ENOMEM;
+
+ h->lhash2_mask = INET_LHTABLE_SIZE - 1;
+ /* INET_LHTABLE_SIZE must be a power of 2 */
+ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
+
+ init_hashinfo_lhash2(h);
+ return 0;
}
+EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 32662e9e5d21..00ec819f949b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -69,9 +69,17 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+
if (unlikely(opt->optlen))
ip_forward_options(skb);
+ skb->tstamp = 0;
return dst_output(net, sk, skb);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d6ee343fdb86..867be8f7f1fa 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -346,10 +346,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct rb_node **rbn, *parent;
struct sk_buff *skb1, *prev_tail;
+ int ihl, end, skb1_run_end;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
- int ihl, end;
int err = -ENOENT;
u8 ecn;
@@ -419,7 +419,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
- * We do the same here for IPv4 (and increment an snmp counter).
+ * We do the same here for IPv4 (and increment an snmp counter) but
+ * we do not want to drop the whole queue in response to a duplicate
+ * fragment.
*/
err = -EINVAL;
@@ -444,13 +446,17 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
+ skb1_run_end = skb1->ip_defrag_offset +
+ FRAG_CB(skb1)->frag_run_len;
if (end <= skb1->ip_defrag_offset)
rbn = &parent->rb_left;
- else if (offset >= skb1->ip_defrag_offset +
- FRAG_CB(skb1)->frag_run_len)
+ else if (offset >= skb1_run_end)
rbn = &parent->rb_right;
- else /* Found an overlap with skb1. */
- goto overlap;
+ else if (offset >= skb1->ip_defrag_offset &&
+ end <= skb1_run_end)
+ goto err; /* No new data, potential duplicate */
+ else
+ goto overlap; /* Found an overlap */
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
@@ -515,6 +521,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct rb_node *rbn;
int len;
int ihlen;
+ int delta;
int err;
u8 ecn;
@@ -556,10 +563,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
if (len > 65535)
goto out_oversize;
+ delta = - head->truesize;
+
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(qp->q.net, delta);
+
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..c7a7bd58a23c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
-static void ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
unsigned int data_len = 0;
struct ip_tunnel *t;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return -ENOENT;
+
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return 0;
data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
break;
}
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else if (tpi->proto == htons(ETH_P_ERSPAN) ||
- tpi->proto == htons(ETH_P_ERSPAN2))
- itn = net_generic(net, erspan_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
- iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
- iph->daddr, iph->saddr, tpi->key);
-
- if (!t)
- return;
-
#if IS_ENABLED(CONFIG_IPV6)
if (tpi->proto == htons(ETH_P_IPV6) &&
!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
type, data_len))
- return;
+ return 0;
#endif
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return 0;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static void gre_err(struct sk_buff *skb, u32 info)
@@ -1339,12 +1341,6 @@ static void ipgre_tap_setup(struct net_device *dev)
ip_tunnel_setup(dev, gre_tap_net_id);
}
-bool is_gretap_dev(const struct net_device *dev)
-{
- return dev->netdev_ops == &gre_tap_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(is_gretap_dev);
-
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@@ -1601,7 +1597,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
memset(&tb, 0, sizeof(tb));
dev = rtnl_create_link(net, name, name_assign_type,
- &ipgre_tap_ops, tb);
+ &ipgre_tap_ops, tb, NULL);
if (IS_ERR(dev))
return dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 35a786c0aaa0..26921f6b3b92 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol;
- const struct net_protocol *ipprot;
- int raw;
+ const struct net_protocol *ipprot;
+ int raw, ret;
- resubmit:
- raw = raw_local_deliver(skb, protocol);
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
} else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
@@ -547,7 +546,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
list_for_each_entry_safe(skb, next, head, list) {
struct dst_entry *dst;
- list_del(&skb->list);
+ skb_list_del_init(skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
@@ -594,7 +593,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- list_del(&skb->list);
+ skb_list_del_init(skb);
skb = ip_rcv_core(skb, net);
if (skb == NULL)
continue;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..c80188875f39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
@@ -867,6 +868,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
@@ -880,8 +882,8 @@ static int __ip_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
+ bool paged, extra_uref;
u32 tskey = 0;
- bool paged;
skb = skb_peek_tail(queue);
@@ -916,6 +918,20 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = true;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+
cork->length += length;
/* So, what's going on in the loop below?
@@ -939,7 +955,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
- unsigned int pagedlen = 0;
+ unsigned int pagedlen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -956,6 +972,7 @@ alloc_new_skb:
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
@@ -1000,12 +1017,6 @@ alloc_new_skb:
skb->csum = 0;
skb_reserve(skb, hh_len);
- /* only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes.
*/
@@ -1038,6 +1049,13 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1067,7 +1085,7 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1097,6 +1115,10 @@ alloc_new_skb:
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1109,6 +1131,8 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ if (uarg)
+ sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c248e0dccbe1..9a0e67b52a4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
}
skb_clear_hash_if_not_l4(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb_set_queue_mapping(skb, 0);
skb_scrub_packet(skb, xnet);
@@ -151,6 +151,7 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
sizeof(struct in6_addr));
else
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ dst->key.tun_flags = src->key.tun_flags;
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
return res;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 88212615bf4c..b9a9873c25c6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -220,7 +220,7 @@ static int __init ic_open_devs(void)
for_each_netdev(&init_net, dev) {
if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
continue;
- if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
pr_err("IP-Config: Failed to open %s\n", dev->name);
}
@@ -238,7 +238,7 @@ static int __init ic_open_devs(void)
if (ic_proto_enabled && !able)
continue;
oflags = dev->flags;
- if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
pr_err("IP-Config: Failed to open %s\n",
dev->name);
continue;
@@ -315,7 +315,7 @@ static void __init ic_close_devs(void)
dev = d->dev;
if (d != ic_dev && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
- dev_change_flags(dev, d->flags);
+ dev_change_flags(dev, d->flags, NULL);
}
kfree(d);
}
@@ -429,6 +429,8 @@ static int __init ic_defaults(void)
ic_netmask = htonl(IN_CLASSB_NET);
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
+ else if (IN_CLASSE(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSE_NET);
else {
pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
&ic_myaddr);
@@ -1361,18 +1363,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)
}
return 0;
}
-
-static int ntp_servers_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ntp_servers_seq_show, NULL);
-}
-
-static const struct file_operations ntp_servers_seq_fops = {
- .open = ntp_servers_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq);
#endif /* CONFIG_PROC_FS */
/*
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct ip_tunnel *t;
int err = 0;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
+ }
+
switch (type) {
case ICMP_DEST_UNREACH:
switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
goto out;
}
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (!t) {
- err = -ENOENT;
- goto out;
- }
-
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a6defbec4f1b..ddbf8c9a1abb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -69,6 +69,8 @@
#include <net/nexthop.h>
#include <net/switchdev.h>
+#include <linux/nospec.h>
+
struct ipmr_rule {
struct fib_rule common;
};
@@ -506,7 +508,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
dev->flags |= IFF_MULTICAST;
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
}
@@ -589,7 +591,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -1612,6 +1614,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
@@ -1686,6 +1689,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
@@ -1802,7 +1806,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
struct vif_device *out_vif = &mrt->vif_table[out_vifi];
struct vif_device *in_vif = &mrt->vif_table[in_vifi];
- if (!skb->offload_mr_fwd_mark)
+ if (!skb->offload_l3_fwd_mark)
return false;
if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
return false;
@@ -1820,8 +1824,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- int in_vifi, struct sk_buff *skb,
- struct mfc_cache *c, int vifi)
+ int in_vifi, struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
@@ -2027,7 +2030,7 @@ forward:
if (skb2)
ipmr_queue_xmit(net, mrt, true_vifi,
- skb2, c, psend);
+ skb2, psend);
}
psend = ct;
}
@@ -2039,9 +2042,9 @@ last_forward:
if (skb2)
ipmr_queue_xmit(net, mrt, true_vifi, skb2,
- c, psend);
+ psend);
} else {
- ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
return;
}
}
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
#include <net/tcp.h>
static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len, u32 *metrics)
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
if (!type)
continue;
- if (type > RTAX_MAX)
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
return -EINVAL;
+ }
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
+ }
} else {
- if (nla_len(nla) != sizeof(u32))
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
return -EINVAL;
+ }
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
val = 65535 - 15;
if (type == RTAX_HOPLIMIT && val > 255)
val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
return -EINVAL;
+ }
metrics[type - 1] = val;
}
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
}
struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len)
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
{
struct dst_metrics *fib_metrics;
int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
if (unlikely(!fib_metrics))
return ERR_PTR(-ENOMEM);
- err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
if (!err) {
refcount_set(&fib_metrics->refcnt, 1);
} else {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 184bf2e0a1ed..80f72cc5ca8d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -156,15 +156,10 @@ config NF_NAT_SNMP_BASIC
To compile it as a module, choose M here. If unsure, say N.
-config NF_NAT_PROTO_GRE
- tristate
- depends on NF_CT_PROTO_GRE
-
config NF_NAT_PPTP
tristate
depends on NF_CONNTRACK
default NF_CONNTRACK_PPTP
- select NF_NAT_PROTO_GRE
config NF_NAT_H323
tristate
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 367993adf4d3..fd7122e0e2c9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,7 +3,7 @@
# Makefile for the netfilter modules on top of IPv4.
#
-nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o
nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
@@ -28,9 +28,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2c8d313ae216..b61977db9b7f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -56,18 +56,15 @@ struct clusterip_config {
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
- struct rcu_head rcu;
-
+ struct rcu_head rcu; /* for call_rcu_bh */
+ struct net *net; /* netns for pernet list */
char ifname[IFNAMSIZ]; /* device ifname */
- struct notifier_block notifier; /* refresh c->ifindex in it */
};
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
#endif
-static unsigned int clusterip_net_id __read_mostly;
-
struct clusterip_net {
struct list_head configs;
/* lock protects the configs list */
@@ -75,51 +72,66 @@ struct clusterip_net {
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *procdir;
+ /* mutex protects the config->pde*/
+ struct mutex mutex;
#endif
};
+static unsigned int clusterip_net_id __read_mostly;
+static inline struct clusterip_net *clusterip_pernet(struct net *net)
+{
+ return net_generic(net, clusterip_net_id);
+}
+
static inline void
clusterip_config_get(struct clusterip_config *c)
{
refcount_inc(&c->refcount);
}
-
static void clusterip_config_rcu_free(struct rcu_head *head)
{
- kfree(container_of(head, struct clusterip_config, rcu));
+ struct clusterip_config *config;
+ struct net_device *dev;
+
+ config = container_of(head, struct clusterip_config, rcu);
+ dev = dev_get_by_name(config->net, config->ifname);
+ if (dev) {
+ dev_mc_del(dev, config->clustermac);
+ dev_put(dev);
+ }
+ kfree(config);
}
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (refcount_dec_and_test(&c->refcount))
- call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
+ call_rcu(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
* entry(rule) is removed, remove the config from lists, but don't free it
* yet, since proc-files could still be holding references */
static inline void
-clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
+clusterip_config_entry_put(struct clusterip_config *c)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(c->net);
local_bh_disable();
if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&cn->lock);
+ local_bh_enable();
/* In case anyone still accesses the file, the open/close
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
+ mutex_lock(&cn->mutex);
if (cn->procdir)
proc_remove(c->pde);
+ mutex_unlock(&cn->mutex);
#endif
- list_del_rcu(&c->list);
- spin_unlock(&cn->lock);
- local_bh_enable();
-
- unregister_netdevice_notifier(&c->notifier);
-
return;
}
local_bh_enable();
@@ -129,7 +141,7 @@ static struct clusterip_config *
__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
@@ -181,32 +193,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct clusterip_net *cn = clusterip_pernet(net);
struct clusterip_config *c;
- c = container_of(this, struct clusterip_config, notifier);
- switch (event) {
- case NETDEV_REGISTER:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- }
- break;
- case NETDEV_UNREGISTER:
- if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
- }
- break;
- case NETDEV_CHANGENAME:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- } else if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
+ spin_lock_bh(&cn->lock);
+ list_for_each_entry_rcu(c, &cn->configs, list) {
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!strcmp(dev->name, c->ifname)) {
+ c->ifindex = dev->ifindex;
+ dev_mc_add(dev, c->clustermac);
+ }
+ break;
+ case NETDEV_UNREGISTER:
+ if (dev->ifindex == c->ifindex) {
+ dev_mc_del(dev, c->clustermac);
+ c->ifindex = -1;
+ }
+ break;
+ case NETDEV_CHANGENAME:
+ if (!strcmp(dev->name, c->ifname)) {
+ c->ifindex = dev->ifindex;
+ dev_mc_add(dev, c->clustermac);
+ } else if (dev->ifindex == c->ifindex) {
+ dev_mc_del(dev, c->clustermac);
+ c->ifindex = -1;
+ }
+ break;
}
- break;
}
+ spin_unlock_bh(&cn->lock);
return NOTIFY_DONE;
}
@@ -215,30 +232,44 @@ static struct clusterip_config *
clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
__be32 ip, const char *iniface)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
struct clusterip_config *c;
+ struct net_device *dev;
int err;
+ if (iniface[0] == '\0') {
+ pr_info("Please specify an interface name\n");
+ return ERR_PTR(-EINVAL);
+ }
+
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
return ERR_PTR(-ENOMEM);
- strcpy(c->ifname, iniface);
- c->ifindex = -1;
- c->clusterip = ip;
+ dev = dev_get_by_name(net, iniface);
+ if (!dev) {
+ pr_info("no such interface %s\n", iniface);
+ kfree(c);
+ return ERR_PTR(-ENOENT);
+ }
+ c->ifindex = dev->ifindex;
+ strcpy(c->ifname, dev->name);
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+ dev_mc_add(dev, c->clustermac);
+ dev_put(dev);
+
+ c->clusterip = ip;
c->num_total_nodes = i->num_total_nodes;
clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
+ c->net = net;
refcount_set(&c->refcount, 1);
spin_lock_bh(&cn->lock);
if (__clusterip_config_find(net, ip)) {
- spin_unlock_bh(&cn->lock);
- kfree(c);
-
- return ERR_PTR(-EBUSY);
+ err = -EBUSY;
+ goto out_config_put;
}
list_add_rcu(&c->list, &cn->configs);
@@ -250,9 +281,11 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
+ mutex_lock(&cn->mutex);
c->pde = proc_create_data(buffer, 0600,
cn->procdir,
&clusterip_proc_fops, c);
+ mutex_unlock(&cn->mutex);
if (!c->pde) {
err = -ENOMEM;
goto err;
@@ -260,22 +293,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
}
#endif
- c->notifier.notifier_call = clusterip_netdev_event;
- err = register_netdevice_notifier(&c->notifier);
- if (!err) {
- refcount_set(&c->entries, 1);
- return c;
- }
+ refcount_set(&c->entries, 1);
+ return c;
#ifdef CONFIG_PROC_FS
- proc_remove(c->pde);
err:
#endif
spin_lock_bh(&cn->lock);
list_del_rcu(&c->list);
+out_config_put:
spin_unlock_bh(&cn->lock);
clusterip_config_put(c);
-
return ERR_PTR(err);
}
@@ -475,34 +503,20 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
&e->ip.dst.s_addr);
return -EINVAL;
} else {
- struct net_device *dev;
-
- if (e->ip.iniface[0] == '\0') {
- pr_info("Please specify an interface name\n");
- return -EINVAL;
- }
-
- dev = dev_get_by_name(par->net, e->ip.iniface);
- if (!dev) {
- pr_info("no such interface %s\n",
- e->ip.iniface);
- return -ENOENT;
- }
- dev_put(dev);
-
config = clusterip_config_init(par->net, cipinfo,
e->ip.dst.s_addr,
e->ip.iniface);
if (IS_ERR(config))
return PTR_ERR(config);
}
- }
+ } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN))
+ return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
- clusterip_config_entry_put(par->net, config);
+ clusterip_config_entry_put(config);
clusterip_config_put(config);
return ret;
}
@@ -524,7 +538,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
/* if no more entries are referencing the config, remove it
* from the list and destroy the proc entry */
- clusterip_config_entry_put(par->net, cipinfo->config);
+ clusterip_config_entry_put(cipinfo->config);
clusterip_config_put(cipinfo->config);
@@ -806,7 +820,7 @@ static const struct file_operations clusterip_proc_fops = {
static int clusterip_net_init(struct net *net)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
int ret;
INIT_LIST_HEAD(&cn->configs);
@@ -824,6 +838,7 @@ static int clusterip_net_init(struct net *net)
pr_err("Unable to proc dir entry\n");
return -ENOMEM;
}
+ mutex_init(&cn->mutex);
#endif /* CONFIG_PROC_FS */
return 0;
@@ -831,13 +846,15 @@ static int clusterip_net_init(struct net *net)
static void clusterip_net_exit(struct net *net)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
+
#ifdef CONFIG_PROC_FS
+ mutex_lock(&cn->mutex);
proc_remove(cn->procdir);
cn->procdir = NULL;
+ mutex_unlock(&cn->mutex);
#endif
nf_unregister_net_hook(net, &cip_arp_ops);
- WARN_ON_ONCE(!list_empty(&cn->configs));
}
static struct pernet_operations clusterip_net_ops = {
@@ -847,6 +864,10 @@ static struct pernet_operations clusterip_net_ops = {
.size = sizeof(struct clusterip_net),
};
+struct notifier_block cip_netdev_notifier = {
+ .notifier_call = clusterip_netdev_event
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
@@ -859,11 +880,17 @@ static int __init clusterip_tg_init(void)
if (ret < 0)
goto cleanup_subsys;
+ ret = register_netdevice_notifier(&cip_netdev_notifier);
+ if (ret < 0)
+ goto unregister_target;
+
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
return 0;
+unregister_target:
+ xt_unregister_target(&clusterip_tg_reg);
cleanup_subsys:
unregister_pernet_subsys(&clusterip_net_ops);
return ret;
@@ -873,11 +900,12 @@ static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+ unregister_netdevice_notifier(&cip_netdev_notifier);
xt_unregister_target(&clusterip_tg_reg);
unregister_pernet_subsys(&clusterip_net_ops);
- /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
- rcu_barrier_bh();
+ /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */
+ rcu_barrier();
}
module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index ce1512b02cb2..fd3f9e8a74da 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -81,9 +81,12 @@ static int __init masquerade_tg_init(void)
int ret;
ret = xt_register_target(&masquerade_tg_reg);
+ if (ret)
+ return ret;
- if (ret == 0)
- nf_nat_masquerade_ipv4_register_notifier();
+ ret = nf_nat_masquerade_ipv4_register_notifier();
+ if (ret)
+ xt_unregister_target(&masquerade_tg_reg);
return ret;
}
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 78a67f961d86..2687db015b6f 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -62,22 +62,8 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
}
#endif /* CONFIG_XFRM */
-static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range2 *range)
-{
- return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
- ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
-}
-
-static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
- __be16 dport)
-{
- return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
-}
-
static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff,
- const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *target,
enum nf_nat_manip_type maniptype)
{
@@ -90,8 +76,8 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
iph = (void *)skb->data + iphdroff;
hdroff = iphdroff + iph->ihl * 4;
- if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
- target, maniptype))
+ if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff,
+ hdroff, target, maniptype))
return false;
iph = (void *)skb->data + iphdroff;
@@ -161,8 +147,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.l3proto = NFPROTO_IPV4,
- .in_range = nf_nat_ipv4_in_range,
- .secure_port = nf_nat_ipv4_secure_port,
.manip_pkt = nf_nat_ipv4_manip_pkt,
.csum_update = nf_nat_ipv4_csum_update,
.csum_recalc = nf_nat_ipv4_csum_recalc,
@@ -186,7 +170,6 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
unsigned int hdrlen = ip_hdrlen(skb);
- const struct nf_nat_l4proto *l4proto;
struct nf_conntrack_tuple target;
unsigned long statusbit;
@@ -217,9 +200,8 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
if (!(ct->status & statusbit))
return 1;
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
- l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ &ct->tuplehash[!dir].tuple, !manip))
return 0;
if (skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -233,8 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
/* Change outer to look like the reply to an incoming packet */
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
- if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
return 0;
return 1;
@@ -391,26 +372,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
static int __init nf_nat_l3proto_ipv4_init(void)
{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
- if (err < 0)
- goto err1;
- err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
- if (err < 0)
- goto err2;
- return err;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
-err1:
- return err;
+ return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
}
static void __exit nf_nat_l3proto_ipv4_exit(void)
{
nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
}
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index a9d5e013e555..41327bb99093 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -147,28 +147,50 @@ static struct notifier_block masq_inet_notifier = {
.notifier_call = masq_inet_event,
};
-static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+static int masq_refcnt;
+static DEFINE_MUTEX(masq_mutex);
-void nf_nat_masquerade_ipv4_register_notifier(void)
+int nf_nat_masquerade_ipv4_register_notifier(void)
{
+ int ret = 0;
+
+ mutex_lock(&masq_mutex);
/* check if the notifier was already set */
- if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
- return;
+ if (++masq_refcnt > 1)
+ goto out_unlock;
/* Register for device down reports */
- register_netdevice_notifier(&masq_dev_notifier);
+ ret = register_netdevice_notifier(&masq_dev_notifier);
+ if (ret)
+ goto err_dec;
/* Register IP address change reports */
- register_inetaddr_notifier(&masq_inet_notifier);
+ ret = register_inetaddr_notifier(&masq_inet_notifier);
+ if (ret)
+ goto err_unregister;
+
+ mutex_unlock(&masq_mutex);
+ return ret;
+
+err_unregister:
+ unregister_netdevice_notifier(&masq_dev_notifier);
+err_dec:
+ masq_refcnt--;
+out_unlock:
+ mutex_unlock(&masq_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
void nf_nat_masquerade_ipv4_unregister_notifier(void)
{
+ mutex_lock(&masq_mutex);
/* check if the notifier still has clients */
- if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
- return;
+ if (--masq_refcnt > 0)
+ goto out_unlock;
unregister_netdevice_notifier(&masq_dev_notifier);
unregister_inetaddr_notifier(&masq_inet_notifier);
+out_unlock:
+ mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 5d259a12e25f..68b4d450391b 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -299,8 +299,6 @@ pptp_inbound_pkt(struct sk_buff *skb,
static int __init nf_nat_helper_pptp_init(void)
{
- nf_nat_need_gre();
-
BUG_ON(nf_nat_pptp_hook_outbound != NULL);
RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
deleted file mode 100644
index 00fda6331ce5..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * nf_nat_proto_gre.c
- *
- * NAT protocol helper module for GRE.
- *
- * GRE is a generic encapsulation protocol, which is generally not very
- * suited for NAT, as it has no protocol-specific part as port numbers.
- *
- * It has an optional key field, which may help us distinguishing two
- * connections between the same two hosts.
- *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
- *
- * PPTP is built on top of a modified version of GRE, and has a mandatory
- * field called "CallID", which serves us for the same purpose as the key
- * field in plain GRE.
- *
- * Documentation about PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-#include <linux/netfilter/nf_conntrack_proto_gre.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
-
-/* generate unique tuple ... */
-static void
-gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t key;
- __be16 *keyptr;
- unsigned int min, i, range_size;
-
- /* If there is no master conntrack we are not PPTP,
- do not change tuples */
- if (!ct->master)
- return;
-
- if (maniptype == NF_NAT_MANIP_SRC)
- keyptr = &tuple->src.u.gre.key;
- else
- keyptr = &tuple->dst.u.gre.key;
-
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
- pr_debug("%p: NATing GRE PPTP\n", ct);
- min = 1;
- range_size = 0xffff;
- } else {
- min = ntohs(range->min_proto.gre.key);
- range_size = ntohs(range->max_proto.gre.key) - min + 1;
- }
-
- pr_debug("min = %u, range_size = %u\n", min, range_size);
-
- for (i = 0; ; ++key) {
- *keyptr = htons(min + key % range_size);
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
-
- pr_debug("%p: no NAT mapping\n", ct);
- return;
-}
-
-/* manipulate a GRE packet according to maniptype */
-static bool
-gre_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct gre_base_hdr *greh;
- struct pptp_gre_header *pgreh;
-
- /* pgreh includes two optional 32bit fields which are not required
- * to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
- return false;
-
- greh = (void *)skb->data + hdroff;
- pgreh = (struct pptp_gre_header *)greh;
-
- /* we only have destination manip of a packet, since 'source key'
- * is not present in the packet itself */
- if (maniptype != NF_NAT_MANIP_DST)
- return true;
-
- switch (greh->flags & GRE_VERSION) {
- case GRE_VERSION_0:
- /* We do not currently NAT any GREv0 packets.
- * Try to behave like "nf_nat_proto_unknown" */
- break;
- case GRE_VERSION_1:
- pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
- pgreh->call_id = tuple->dst.u.gre.key;
- break;
- default:
- pr_debug("can't nat unknown GRE version\n");
- return false;
- }
- return true;
-}
-
-static const struct nf_nat_l4proto gre = {
- .l4proto = IPPROTO_GRE,
- .manip_pkt = gre_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = gre_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_gre_init(void)
-{
- return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
-}
-
-static void __exit nf_nat_proto_gre_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
-}
-
-module_init(nf_nat_proto_gre_init);
-module_exit(nf_nat_proto_gre_fini);
-
-void nf_nat_need_gre(void)
-{
- return;
-}
-EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
deleted file mode 100644
index 6d7cf1d79baf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool
-icmp_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
- ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
-}
-
-static void
-icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
- /* If no range specified... */
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xFFFF;
-
- for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
- (id % range_size));
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
- return;
-}
-
-static bool
-icmp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct icmphdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct icmphdr *)(skb->data + hdroff);
- inet_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, false);
- hdr->un.echo.id = tuple->src.u.icmp.id;
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
- .l4proto = IPPROTO_ICMP,
- .manip_pkt = icmp_manip_pkt,
- .in_range = icmp_in_range,
- .unique_tuple = icmp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 5cd06ba3535d..aa8304c618b8 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -102,6 +102,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
+ struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
struct iphdr *niph;
const struct tcphdr *oth;
@@ -147,10 +148,11 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- if (oldskb->nf_bridge) {
+ br_indev = nf_bridge_get_physindev(oldskb);
+ if (br_indev) {
struct ethhdr *oeth = eth_hdr(oldskb);
- nskb->dev = nf_bridge_get_physindev(oldskb);
+ nskb->dev = br_indev;
niph->tot_len = htons(nskb->len);
ip_send_check(niph);
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f1193e1e928a..6847de1d1db8 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -69,7 +69,9 @@ static int __init nft_masq_ipv4_module_init(void)
if (ret < 0)
return ret;
- nf_nat_masquerade_ipv4_register_notifier();
+ ret = nf_nat_masquerade_ipv4_register_notifier();
+ if (ret)
+ nft_unregister_expr(&nft_masq_ipv4_type);
return ret;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a670..c3610b37bb4c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..92d249e053be 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
#include <net/protocol.h>
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..c55a5432cf37 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -391,7 +390,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
@@ -805,7 +804,7 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
@@ -970,7 +969,7 @@ struct proto raw_prot = {
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
@@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void)
unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c0a9d26c06ce..ce92f73cf104 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev,
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
skb_mac_header(skb),
- dev->hard_header_len, true);
+ dev->hard_header_len, false);
}
}
#endif
@@ -2849,6 +2849,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
err = -rt->dst.error;
} else {
fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ skb->dev = net->loopback_dev;
rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 891ed2f91467..ba0fc4b18465 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_ecn",
.data = &init_net.ipv4.sysctl_tcp_ecn,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9e6bc4d6daa7..27e2f6837062 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1423,7 +1423,7 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg);
+ sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
@@ -2147,7 +2147,7 @@ skip_copy:
sk_eat_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
@@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state)
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
0;
}
@@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
TCP_NLA_PAD);
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
return stats;
}
@@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void)
if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);
- if (!tcp_md5sig_pool_populated)
+ if (!tcp_md5sig_pool_populated) {
__tcp_alloc_md5sig_pool();
+ if (tcp_md5sig_pool_populated)
+ static_key_slow_inc(&tcp_md5_needed);
+ }
mutex_unlock(&tcp_md5sig_mutex);
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 9277abdd822a..0f497fc49c3f 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
-/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
static const int bbr_pacing_margin_percent = 1;
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
@@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3b45fe530f91..1bb7321a256d 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -8,6 +8,7 @@
#include <linux/wait.h>
#include <net/inet_common.h>
+#include <net/tls.h>
static bool tcp_bpf_stream_read(const struct sock *sk)
{
@@ -198,7 +199,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
msg->sg.start = i;
msg->sg.size -= apply_bytes;
sk_psock_queue_msg(psock, tmp);
- sk->sk_data_ready(sk);
+ sk_psock_data_ready(sk, psock);
} else {
sk_msg_free(sk, tmp);
kfree(tmp);
@@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
u32 off;
while (1) {
+ bool has_tx_ulp;
+
sge = sk_msg_elem(msg, msg->sg.start);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
@@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
tcp_rate_check_app_limited(sk);
retry:
- ret = do_tcp_sendpages(sk, page, off, size, flags);
+ has_tx_ulp = tls_sw_has_ctx_tx(sk);
+ if (has_tx_ulp) {
+ flags |= MSG_SENDPAGE_NOPOLICY;
+ ret = kernel_sendpage_locked(sk,
+ page, off, size, flags);
+ } else {
+ ret = do_tcp_sendpages(sk, page, off, size, flags);
+ }
+
if (ret <= 0)
return ret;
if (apply)
@@ -289,12 +300,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
{
bool cork = false, enospc = msg->sg.start == msg->sg.end;
struct sock *sk_redir;
- u32 tosend;
+ u32 tosend, delta = 0;
int ret;
more_data:
- if (psock->eval == __SK_NONE)
+ if (psock->eval == __SK_NONE) {
+ /* Track delta in msg size to add/subtract it on SK_DROP from
+ * returned to user copied size. This ensures user doesn't
+ * get a positive return code with msg_cut_data and SK_DROP
+ * verdict.
+ */
+ delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (msg->sg.size < delta)
+ delta -= msg->sg.size;
+ else
+ delta = 0;
+ }
if (msg->cork_bytes &&
msg->cork_bytes > msg->sg.size && !enospc) {
@@ -350,7 +372,7 @@ more_data:
default:
sk_msg_free_partial(sk, msg, tosend);
sk_msg_apply_bytes(psock, tosend);
- *copied -= tosend;
+ *copied -= (tosend + delta);
return -EACCES;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1e37c1388189..76858b14ebe9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
u32 delta_us;
- if (!delta)
- delta = 1;
- delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- tcp_rcv_rtt_update(tp, delta_us, 0);
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ if (!delta)
+ delta = 1;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ tcp_rcv_rtt_update(tp, delta_us, 0);
+ }
}
}
@@ -1863,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 prior_sacked = tp->sacked_out;
+ if (num_dupack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ s32 delivered;
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- if (tp->sacked_out > prior_sacked)
- tp->delivered++; /* Some out-of-order packet is delivered */
- tcp_verify_left_out(tp);
+ tp->sacked_out += num_dupack;
+ tcp_check_reno_reordering(sk, 0);
+ delivered = tp->sacked_out - prior_sacked;
+ if (delivered > 0)
+ tp->delivered += delivered;
+ tcp_verify_left_out(tp);
+ }
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -2457,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
- !(flag & FLAG_LOST_RETRANS)) {
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
+ FLAG_RETRANS_DATA_ACKED) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
@@ -2634,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2653,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
return;
if (after(tp->snd_nxt, tp->high_seq)) {
- if (flag & FLAG_DATA_SACKED || is_dupack)
+ if (flag & FLAG_DATA_SACKED || num_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
@@ -2679,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
*/
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+ tcp_add_reno_sack(sk, num_dupack);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
@@ -2757,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
- bool is_dupack, int *ack_flag, int *rexmit)
+ int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+ tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
@@ -2810,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
- if (tcp_is_reno(tp) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (tcp_is_reno(tp))
+ tcp_add_reno_sack(sk, num_dupack);
} else {
if (tcp_try_undo_partial(sk, prior_snd_una))
return;
@@ -2826,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack, rexmit);
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
@@ -2837,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
- tcp_add_reno_sack(sk);
+ tcp_add_reno_sack(sk, num_dupack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -2910,9 +2915,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- seq_rtt_us = ca_rtt_us = delta_us;
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ ca_rtt_us = seq_rtt_us;
+ }
}
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
@@ -3558,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- bool is_dupack = false;
+ int num_dupack = 0;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
@@ -3610,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
@@ -3668,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
+ num_dupack = 1;
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
+ if (!(flag & FLAG_DATA))
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
}
@@ -3687,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
}
@@ -3712,7 +3725,7 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
@@ -4602,13 +4615,12 @@ end:
}
}
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
- bool *fragstolen)
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
- __skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail,
skb, fragstolen)) ? 1 : 0;
@@ -4659,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4719,7 +4731,7 @@ queue_and_out:
goto drop;
}
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5595,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
+ __skb_pull(skb, tcp_header_len);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..efc6fef692ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_rtx_queue_head(sk);
- BUG_ON(!skb);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
@@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
* We need to maintain these in the sk structure.
*/
+struct static_key tcp_md5_needed __read_mostly;
+EXPORT_SYMBOL(tcp_md5_needed);
+
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
- const union tcp_md5_addr *addr,
- int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
}
return best_match;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
@@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
- /* Only socket owner can try to collapse/prune rx queues
- * to reduce memory overhead, so add a little headroom here.
- * Few sockets backlog are possibly concurrently non empty.
- */
- limit += 64*1024;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
*/
skb_condense(skb);
+ skb_dst_drop(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+ shinfo = skb_shinfo(skb);
+
+ if (!shinfo->gso_size)
+ shinfo->gso_size = skb->len - hdrlen;
+
+ if (!shinfo->gso_segs)
+ shinfo->gso_segs = 1;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+ tail->decrypted != skb->decrypted ||
+#endif
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ thtail->window = th->window;
+
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+ skb_shinfo(tail)->gso_size);
+
+ gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+ skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64*1024;
+
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
@@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* Default TSQ limit of 16 TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 870b0a335061..0fbf7d4df9da 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -10,6 +10,7 @@
* TCPv4 GSO/GRO support
*/
+#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/protocol.h>
@@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
@@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *
return tcp_gro_receive(head, skb);
}
-static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3f510cad0b3e..730bc44dbad9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- (*rcv_wscale) = 0;
+ *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
- space >>= 1;
- (*rcv_wscale)++;
- }
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -1904,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
- bool *is_cwnd_limited, u32 max_segs)
+ bool *is_cwnd_limited,
+ bool *is_rwnd_limited,
+ u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
-
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- goto send_now;
+ s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1968,15 +1971,33 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+ delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
- /* Ok, it looks like it is advisable to defer. */
+ /* Ok, it looks like it is advisable to defer.
+ * Three cases are tracked :
+ * 1) We are cwnd-limited
+ * 2) We are rwnd-limited
+ * 3) We are application limited.
+ */
+ if (cong_win < send_win) {
+ if (cong_win <= skb->len) {
+ *is_cwnd_limited = true;
+ return true;
+ }
+ } else {
+ if (send_win <= skb->len) {
+ *is_rwnd_limited = true;
+ return true;
+ }
+ }
- if (cong_win < send_win && cong_win <= skb->len)
- *is_cwnd_limited = true;
+ /* If this packet won't get more data, do not wait. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+ TCP_SKB_CB(skb)->eor)
+ goto send_now;
return true;
@@ -2212,8 +2233,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(unsigned long, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ limit = min_t(unsigned long, limit,
+ sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
@@ -2356,7 +2378,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
} else {
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
- max_segs))
+ &is_rwnd_limited, max_segs))
break;
}
@@ -2494,15 +2516,18 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
}
skb = skb_rb_last(&sk->tcp_rtx_queue);
+ if (unlikely(!skb)) {
+ WARN_ONCE(tp->packets_out,
+ "invalid inflight: %u state %u cwnd %u mss %d\n",
+ tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
+ inet_csk(sk)->icsk_pending = 0;
+ return;
+ }
/* At most one outstanding TLP retransmission. */
if (tp->tlp_high_seq)
goto rearm_timer;
- /* Retransmit last segment. */
- if (WARN_ON(!skb))
- goto rearm_timer;
-
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
@@ -2920,7 +2945,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
}
return err;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5f8b6d3cd855..f87dbc78b6bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
+ s32 remaining;
start_ts = tcp_retransmit_stamp(sk);
if (!icsk->icsk_user_timeout || !start_ts)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
- if (elapsed >= icsk->icsk_user_timeout)
+ remaining = icsk->icsk_user_timeout - elapsed;
+ if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
- else
- return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}
/**
@@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk,
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
timeout = jiffies_to_msecs(timeout);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
+ return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
/* A write timeout has occurred. Process the after effects. */
@@ -376,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk)
return;
}
- if (icsk->icsk_probes_out > max_probes) {
+ if (icsk->icsk_probes_out >= max_probes) {
abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -482,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk)
goto out_reset_timer;
}
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
if (tcp_write_timeout(sk))
goto out;
if (icsk->icsk_retransmits == 0) {
- int mib_idx;
+ int mib_idx = 0;
if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
@@ -501,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk)
mib_idx = LINUX_MIB_TCPSACKFAILURES;
else
mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else {
- mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ if (mib_idx)
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ drop:
}
#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1976fddb9e00..3fb0ed5e4789 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -115,6 +116,7 @@
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -371,21 +373,19 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
ipv6_only_sock(sk))
return -1;
- score = (sk->sk_family == PF_INET) ? 2 : 1;
- inet = inet_sk(sk);
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
- if (inet->inet_rcv_saddr) {
- if (inet->inet_rcv_saddr != daddr)
- return -1;
- score += 4;
- }
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
@@ -398,15 +398,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -465,65 +461,30 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif,
int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
- struct sock *sk, *result;
+ struct sock *result;
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
bool exact_dif = udp_lib_exact_dif_match(net, skb);
- int score, badness;
- u32 hash = 0;
- if (hslot->count > 10) {
- hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ exact_dif, hslot2, skb);
+ if (!result) {
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
+ htonl(INADDR_ANY), hnum, dif, sdif,
exact_dif, hslot2, skb);
- if (!result) {
- unsigned int old_slot2 = slot2;
- hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
- slot2 = hash2 & udptable->mask;
- /* avoid searching the same slot again. */
- if (unlikely(slot2 == old_slot2))
- return result;
-
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
- }
- if (unlikely(IS_ERR(result)))
- return NULL;
- return result;
- }
-begin:
- result = NULL;
- badness = 0;
- sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
- if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (unlikely(IS_ERR(result)))
- return NULL;
- if (result)
- return result;
- }
- result = sk;
- badness = score;
- }
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -585,6 +546,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+ static_branch_inc(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+
+ if (!iptun_encaps[i])
+ continue;
+ handler = rcu_dereference(iptun_encaps[i]->err_handler);
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb, u32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -596,13 +640,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -612,8 +657,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+ info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
err = 0;
@@ -656,6 +714,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ goto out;
+ }
if (!inet->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -665,12 +727,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, &udp_table);
}
/*
@@ -1713,6 +1775,10 @@ try_again:
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
+
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (inet->cmsg_flags)
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
@@ -1889,13 +1955,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
- static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
/* returns:
* -1: error
* 0: success
@@ -1904,7 +1963,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -2007,6 +2066,27 @@ drop:
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+ }
+ return 0;
+}
+
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -2398,11 +2478,15 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_dec(&udp_encap_needed_key);
}
}
@@ -2447,7 +2531,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
- udp_encap_enable();
+ lock_sock(sk);
+ udp_tunnel_encap_enable(sk->sk_socket);
+ release_sock(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2469,6 +2555,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->gso_size = val;
break;
+ case UDP_GRO:
+ lock_sock(sk);
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+ up->gro_enabled = valbool;
+ release_sock(sk);
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
#include <net/inet_common.h>
int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..64f9715173ac 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -13,6 +13,7 @@
#include <linux/skbuff.h>
#include <net/udp.h>
#include <net/protocol.h>
+#include <net/inet_common.h>
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
@@ -343,6 +344,56 @@ out:
return segs;
}
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Terminate the flow on len mismatch or if it grow "too much".
+ * Under small packet flood GRO count could elsewhere grow a lot
+ * leading to execessive truesize values
+ */
+ if (!skb_gro_receive(p, skb) &&
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+ else if (uh->len != uh2->len)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
+INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup)
{
@@ -353,23 +404,28 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
int flush = 1;
struct sock *sk;
+ rcu_read_lock();
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (!sk)
+ goto out_unlock;
+
+ if (udp_sk(sk)->gro_enabled) {
+ pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+ rcu_read_unlock();
+ return pp;
+ }
+
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
- !NAPI_GRO_CB(skb)->csum_valid))
- goto out;
+ !NAPI_GRO_CB(skb)->csum_valid) ||
+ !udp_sk(sk)->gro_receive)
+ goto out_unlock;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
-
- if (sk && udp_sk(sk)->gro_receive)
- goto unflush;
- goto out_unlock;
-
-unflush:
flush = 0;
list_for_each_entry(p, head, list) {
@@ -394,14 +450,13 @@ unflush:
out_unlock:
rcu_read_unlock();
-out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff *udp4_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
@@ -427,6 +482,19 @@ flush:
return NULL;
}
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+ return 0;
+}
+
int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup)
{
@@ -437,16 +505,22 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- /* Set encapsulation before calling into inner gro_complete() functions
- * to make them set up the inner offsets.
- */
- skb->encapsulation = 1;
-
rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_complete)
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_enabled) {
+ err = udp_gro_complete_segment(skb);
+ } else if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ }
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -456,18 +530,14 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
}
EXPORT_SYMBOL(udp_gro_complete);
-static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..be8b5b2157d8 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
+ if (cfg->bind_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, cfg->bind_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+ dev->name, strlen(dev->name) + 1);
+ dev_put(dev);
+
+ if (err < 0)
+ goto error;
+ }
+
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udplite_table);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(esp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ah4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ipcomp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct net_protocol esp4_protocol = {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 045597b9a7c0..521e471f1cf9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2820,7 +2820,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg)
dev = __dev_get_by_name(net, p.name);
if (!dev)
goto err_exit;
- err = dev_open(dev);
+ err = dev_open(dev, NULL);
}
}
#endif
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 94999058e110..cca3b3603c42 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -433,7 +433,6 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
const struct in6_addr *addr)
{
- unsigned int hash = inet6_acaddr_hash(net, addr);
struct net_device *nh_dev;
struct ifacaddr6 *aca;
bool found = false;
@@ -441,7 +440,9 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
- else
+ else {
+ unsigned int hash = inet6_acaddr_hash(net, addr);
+
hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
aca_addr_lst) {
nh_dev = fib6_info_nh_dev(aca->aca_rt);
@@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
break;
}
}
+ }
rcu_read_unlock();
return found;
}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..bde08aa549f3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
case IPV6_2292PKTINFO:
{
struct net_device *dev = NULL;
+ int src_idx;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
@@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ src_idx = src_info->ipi6_ifindex;
- if (src_info->ipi6_ifindex) {
+ if (src_idx) {
if (fl6->flowi6_oif &&
- src_info->ipi6_ifindex != fl6->flowi6_oif)
+ src_idx != fl6->flowi6_oif &&
+ (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+ !sk_dev_equal_l3scope(sk, src_idx)))
return -EINVAL;
- fl6->flowi6_oif = src_info->ipi6_ifindex;
+ fl6->flowi6_oif = src_idx;
}
addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 63b2b66f9dfa..5afe9f83374d 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -145,10 +145,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)
void *tmp;
struct xfrm_state *x;
- if (xo && (xo->flags & XFRM_DEV_RESUME))
- x = skb->sp->xvec[skb->sp->len - 1];
- else
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
x = skb_dst(skb)->xfrm;
+ }
tmp = ESP_SKB_CB(skb)->tmp;
esp_ssg_unref(x, tmp);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 6177e2171171..d46b4eb645c2 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -68,11 +68,12 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
xo = xfrm_offload(skb);
if (!xo || !(xo->flags & CRYPTO_DONE)) {
- err = secpath_set(skb);
- if (err)
+ struct sec_path *sp = secpath_set(skb);
+
+ if (!sp)
goto out;
- if (skb->sp->len == XFRM_MAX_DEPTH)
+ if (sp->len == XFRM_MAX_DEPTH)
goto out;
x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
@@ -81,8 +82,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
if (!x)
goto out;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
xo = xfrm_offload(skb);
if (!xo) {
@@ -141,6 +142,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
struct crypto_aead *aead;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return ERR_PTR(-EINVAL);
@@ -148,7 +150,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
return ERR_PTR(-EINVAL);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
index 6de3c04b0f30..bd675c61deb1 100644
--- a/net/ipv6/fou6.c
+++ b/net/ipv6/fou6.c
@@ -4,6 +4,7 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/udp.h>
+#include <linux/icmpv6.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/fou.h>
@@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue6_err_proto_handler(int proto, struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info)
+{
+ const struct inet6_protocol *ipprot;
+
+ ipprot = rcu_dereference(inet6_protos[proto]);
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt,
+ type, code, offset, info);
+ goto out;
+ case 6:
+ ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt,
+ type, code, offset, info);
+ goto out;
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+ ret = gue6_err_proto_handler(guehdr->proto_ctype, skb,
+ opt, type, code, offset, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
+
static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou6_build_header,
+ .err_handler = gue6_err,
};
static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue6_build_header,
+ .err_handler = gue6_err,
};
static int ip6_tnl_encap_add_fou_ops(void)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c9c53ade55c3..5d7aa2c2770c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net)
return net->ipv6.icmp_sk[smp_processor_id()];
}
-static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
@@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
ping_err(skb, offset, ntohl(info));
+
+ return 0;
}
static int icmpv6_rcv(struct sk_buff *skb);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..f3515ebe9b3a 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -102,22 +102,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
score = 1;
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
- score++;
- }
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
@@ -166,26 +157,12 @@ struct sock *inet6_lookup_listener(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- bool exact_dif = inet6_exact_dif_match(net, skb);
struct inet_listen_hashbucket *ilb2;
- struct sock *sk, *result = NULL;
- int score, hiscore = 0;
+ struct sock *result = NULL;
unsigned int hash2;
- u32 phash = 0;
-
- if (ilb->count <= 10 || !hashinfo->lhash2)
- goto port_lookup;
-
- /* Too many sk in the ilb bucket (which is hashed by port alone).
- * Try lhash2 (which is hashed by port and addr) instead.
- */
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
@@ -194,33 +171,12 @@ struct sock *inet6_lookup_listener(struct net *net,
goto done;
/* Lookup lhash2 with in6addr_any */
-
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
+ saddr, sport, &in6addr_any, hnum,
dif, sdif);
- goto done;
-
-port_lookup:
- sk_for_each(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
- if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- goto done;
- }
- result = sk;
- hiscore = score;
- }
- }
done:
if (unlikely(IS_ERR(result)))
return NULL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 515adbdba1d2..229e55c99021 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
}
-static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
@@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
offset) < 0)
- return;
+ return -EINVAL;
ipv6h = (const struct ipv6hdr *)skb->data;
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
tpi.key, tpi.proto);
if (!t)
- return;
+ return -ENOENT;
switch (type) {
struct ipv6_tlv_tnl_enc_lim *tel;
@@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
t->parms.name);
if (code != ICMPV6_PORT_UNREACH)
break;
- return;
+ return 0;
case ICMPV6_TIME_EXCEED:
if (code == ICMPV6_EXC_HOPLIMIT) {
net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
t->parms.name);
break;
}
- return;
+ return 0;
case ICMPV6_PARAMPROB:
teli = 0;
if (code == ICMPV6_HDR_FIELD)
@@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
t->parms.name);
}
- return;
+ return 0;
case ICMPV6_PKT_TOOBIG:
ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
- return;
+ return 0;
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
- return;
+ return 0;
}
if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
@@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
@@ -1883,12 +1885,6 @@ static void ip6gre_tap_setup(struct net_device *dev)
netif_keep_dst(dev);
}
-bool is_ip6gretap_dev(const struct net_device *dev)
-{
- return dev->netdev_ops == &ip6gre_tap_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(is_ip6gretap_dev);
-
static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
struct ip_tunnel_encap *ipencap)
{
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..c7ed2b6d5a1d 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
list_for_each_entry_safe(skb, next, head, list) {
struct dst_entry *dst;
- list_del(&skb->list);
+ skb_list_del_init(skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
@@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- list_del(&skb->list);
+ skb_list_del_init(skb);
skb = ip6_rcv_core(skb, dev, net);
if (skb == NULL)
continue;
@@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
/*
* Deliver the packet to the host
*/
-
-
-static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+ bool have_final)
{
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
- int nexthdr;
bool raw;
- bool have_final = false;
/*
* Parse extension headers
*/
- rcu_read_lock();
resubmit:
idev = ip6_dst_idev(skb_dst(skb));
- if (!pskb_pull(skb, skb_transport_offset(skb)))
- goto discard;
nhoff = IP6CB(skb)->nhoff;
- nexthdr = skb_network_header(skb)[nhoff];
+ if (!have_final) {
+ if (!pskb_pull(skb, skb_transport_offset(skb)))
+ goto discard;
+ nexthdr = skb_network_header(skb)[nhoff];
+ }
resubmit_final:
raw = raw6_local_deliver(skb, nexthdr);
@@ -359,6 +357,8 @@ resubmit_final:
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+ int sdif = inet6_sdif(skb);
+ struct net_device *dev;
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +371,19 @@ resubmit_final:
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(net, sdif);
+ if (!dev)
+ goto discard;
+ } else {
+ dev = skb->dev;
+ }
+
if (ipv6_addr_is_multicast(&hdr->daddr) &&
- !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
- &hdr->saddr) &&
+ !ipv6_chk_mcast_addr(dev, &hdr->daddr,
+ &hdr->saddr) &&
!ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
goto discard;
}
@@ -411,13 +421,19 @@ resubmit_final:
consume_skb(skb);
}
}
- rcu_read_unlock();
- return 0;
+ return;
discard:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
- rcu_read_unlock();
kfree_skb(skb);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ rcu_read_lock();
+ ip6_protocol_deliver_rcu(net, skb, 0, false);
+ rcu_read_unlock();
+
return 0;
}
@@ -432,15 +448,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
int ip6_mc_input(struct sk_buff *skb)
{
+ int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+ struct net_device *dev;
bool deliver;
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
__in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
skb->len);
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+ if (!dev) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ } else {
+ dev = skb->dev;
+ }
+
hdr = ipv6_hdr(skb);
- deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+ deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
+ if (sdif)
+ rcu_read_unlock();
#ifdef CONFIG_IPV6_MROUTE
/*
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index c7e495f12011..5c045691c302 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -20,6 +20,23 @@
#include "ip6_offload.h"
+/* All GRO functions are always builtin, except UDP over ipv6, which lays in
+ * ipv6 module, as it depends on UDPv6 lookup function, so we need special care
+ * when ipv6 is built as a module
+ */
+#if IS_BUILTIN(CONFIG_IPV6)
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+
+#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \
+({ \
+ unlikely(gro_recursion_inc_test(skb)) ? \
+ NAPI_GRO_CB(skb)->flush |= 1, NULL : \
+ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
+})
+
static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
{
const struct net_offload *ops = NULL;
@@ -164,8 +181,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-static struct sk_buff *ipv6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff *pp = NULL;
@@ -229,14 +250,21 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
* XXX skbs on the gro_list have all been parsed and pulled
* already so we don't need to compare nlen
* (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
- * memcmp() alone below is suffcient, right?
+ * memcmp() alone below is sufficient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
- memcmp(&iph->nexthdr, &iph2->nexthdr,
- nlen - offsetof(struct ipv6hdr, nexthdr))) {
+ !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+ !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+ *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+not_same_flow:
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
+ if (unlikely(nlen > sizeof(struct ipv6hdr))) {
+ if (memcmp(iph + 1, iph2 + 1,
+ nlen - sizeof(struct ipv6hdr)))
+ goto not_same_flow;
+ }
/* flush if Traffic Class fields are different */
NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
NAPI_GRO_CB(p)->flush |= flush;
@@ -253,7 +281,8 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
skb_gro_postpull_rcsum(skb, iph, nlen);
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
@@ -294,7 +323,9 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
return inet_gro_receive(head, skb);
}
-static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
@@ -313,7 +344,8 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
- err = ops->callbacks.gro_complete(skb, nhoff);
+ err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
+ udp6_gro_complete, skb, nhoff);
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89e0d5118afe..5f9fa0302b5a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
+ unsigned int head_room;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
int hlimit = -1;
u32 mtu;
- if (opt) {
- unsigned int head_room;
+ head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+ if (opt)
+ head_room += opt->opt_nflen + opt->opt_flen;
- /* First: exthdrs may take lots of space (~8K for now)
- MAX_HEADER is not enough.
- */
- head_room = opt->opt_nflen + opt->opt_flen;
- seg_len += head_room;
- head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
-
- if (skb_headroom(skb) < head_room) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
- if (!skb2) {
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
- return -ENOBUFS;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
+ if (unlikely(skb_headroom(skb) < head_room)) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
+ if (!skb2) {
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -ENOBUFS;
}
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ consume_skb(skb);
+ skb = skb2;
+ }
+
+ if (opt) {
+ seg_len += opt->opt_nflen + opt->opt_flen;
+
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
+
if (opt->opt_nflen)
ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
&fl6->saddr);
@@ -378,6 +378,14 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+
+ skb->tstamp = 0;
return dst_output(net, sk, skb);
}
@@ -574,6 +582,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
skb_copy_secmark(to, from);
}
@@ -1245,6 +1254,7 @@ static int __ip6_append_data(struct sock *sk,
{
struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+ struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
int dst_exthdrlen = 0;
int hh_len;
@@ -1257,7 +1267,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
- bool paged;
+ bool paged, extra_uref;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1322,6 +1332,20 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = true;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1354,7 +1378,7 @@ emsgsize:
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
- unsigned int pagedlen = 0;
+ unsigned int pagedlen;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1378,6 +1402,7 @@ alloc_new_skb:
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
@@ -1439,12 +1464,6 @@ alloc_new_skb:
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
dst_exthdrlen);
- /* Only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes
*/
@@ -1476,6 +1495,13 @@ alloc_new_skb:
exthdrlen = 0;
dst_exthdrlen = 0;
+ /* Only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1505,7 +1531,7 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1535,6 +1561,10 @@ alloc_new_skb:
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1547,6 +1577,8 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ if (uarg)
+ sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a9d06d4dd057..99179b9c8384 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -901,6 +901,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
+ ipv6h = ipv6_hdr(skb);
if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr))
goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b283f293ee4a..ad1a9ccd4b44 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -15,7 +15,7 @@
int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
{
- struct sockaddr_in6 udp6_addr;
+ struct sockaddr_in6 udp6_addr = {};
int err;
struct socket *sock = NULL;
@@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
}
+ if (cfg->bind_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, cfg->bind_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+ dev->name, strlen(dev->name) + 1);
+ dev_put(dev);
+
+ if (err < 0)
+ goto error;
+ }
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
@@ -42,6 +58,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
goto error;
if (cfg->peer_udp_port) {
+ memset(&udp6_addr, 0, sizeof(udp6_addr));
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
sizeof(udp6_addr.sin6_addr));
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index eeaf7455d51e..706fe42e4928 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -318,6 +318,7 @@ static int vti6_rcv(struct sk_buff *skb)
return 0;
}
+ ipv6h = ipv6_hdr(skb);
if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
t->dev->stats.rx_dropped++;
rcu_read_unlock();
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e2ea691e42c6..8276f1224f16 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -52,6 +52,8 @@
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
+#include <linux/nospec.h>
+
struct ip6mr_rule {
struct fib_rule common;
};
@@ -655,7 +657,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
return NULL;
}
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -1841,6 +1843,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
return -EFAULT;
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
+ vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
@@ -1915,6 +1918,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return -EFAULT;
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
+ vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
@@ -1968,7 +1972,7 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
*/
static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *c, int vifi)
+ struct sk_buff *skb, int vifi)
{
struct ipv6hdr *ipv6h;
struct vif_device *vif = &mrt->vif_table[vifi];
@@ -2134,15 +2138,14 @@ forward:
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ip6mr_forward2(net, mrt, skb2,
- c, psend);
+ ip6mr_forward2(net, mrt, skb2, psend);
}
psend = ct;
}
}
last_forward:
if (psend != -1) {
- ip6mr_forward2(net, mrt, skb, c, psend);
+ ip6mr_forward2(net, mrt, skb, psend);
return;
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 381ce38940ae..973e215c3114 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ sticky_done:
retv = -EFAULT;
break;
}
- if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+ if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 5ae8e1c51079..8b075f0bc351 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -24,7 +24,8 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
unsigned int hh_len;
struct dst_entry *dst;
struct flowi6 fl6 = {
- .flowi6_oif = sk ? sk->sk_bound_dev_if : 0,
+ .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if :
+ rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0,
.flowi6_mark = skb->mark,
.flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 200c0c235565..9ea43d5256e0 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
-nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o
nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 491f808e356a..29c7f1915a96 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -58,8 +58,12 @@ static int __init masquerade_tg6_init(void)
int err;
err = xt_register_target(&masquerade_tg6_reg);
- if (err == 0)
- nf_nat_masquerade_ipv6_register_notifier();
+ if (err)
+ return err;
+
+ err = nf_nat_masquerade_ipv6_register_notifier();
+ if (err)
+ xt_unregister_target(&masquerade_tg6_reg);
return err;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index d219979c3e52..181da2c40f9a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -341,7 +341,7 @@ static bool
nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev)
{
struct sk_buff *fp, *head = fq->q.fragments;
- int payload_len;
+ int payload_len, delta;
u8 ecn;
inet_frag_kill(&fq->q);
@@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
return false;
}
+ delta = - head->truesize;
+
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
return false;
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(fq->q.net, delta);
+
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index ca6d38698b1a..23022447eb49 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -61,22 +61,8 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
}
#endif
-static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range2 *range)
-{
- return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
- ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
-}
-
-static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t,
- __be16 dport)
-{
- return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport);
-}
-
static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff,
- const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *target,
enum nf_nat_manip_type maniptype)
{
@@ -96,8 +82,8 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
goto manip_addr;
if ((frag_off & htons(~0x7)) == 0 &&
- !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff,
- target, maniptype))
+ !nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff,
+ target, maniptype))
return false;
/* must reload, offset might have changed */
@@ -171,8 +157,6 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
.l3proto = NFPROTO_IPV6,
- .secure_port = nf_nat_ipv6_secure_port,
- .in_range = nf_nat_ipv6_in_range,
.manip_pkt = nf_nat_ipv6_manip_pkt,
.csum_update = nf_nat_ipv6_csum_update,
.csum_recalc = nf_nat_ipv6_csum_recalc,
@@ -196,7 +180,6 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
} *inside;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
- const struct nf_nat_l4proto *l4proto;
struct nf_conntrack_tuple target;
unsigned long statusbit;
@@ -227,9 +210,8 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
if (!(ct->status & statusbit))
return 1;
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr);
if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
- l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ &ct->tuplehash[!dir].tuple, !manip))
return 0;
if (skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -244,8 +226,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
}
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6);
- if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip))
+ if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip))
return 0;
return 1;
@@ -415,26 +396,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn);
static int __init nf_nat_l3proto_ipv6_init(void)
{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
- if (err < 0)
- goto err1;
- err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6);
- if (err < 0)
- goto err2;
- return err;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
-err1:
- return err;
+ return nf_nat_l3proto_register(&nf_nat_l3proto_ipv6);
}
static void __exit nf_nat_l3proto_ipv6_exit(void)
{
nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6);
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
}
MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 3e4bf2286abe..0ad0da5a2600 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -132,8 +132,8 @@ static void iterate_cleanup_work(struct work_struct *work)
* of ipv6 addresses being deleted), we also need to add an upper
* limit to the number of queued work items.
*/
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event, void *ptr)
+static int masq_inet6_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
{
struct inet6_ifaddr *ifa = ptr;
const struct net_device *dev;
@@ -171,30 +171,53 @@ static int masq_inet_event(struct notifier_block *this,
return NOTIFY_DONE;
}
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
+static struct notifier_block masq_inet6_notifier = {
+ .notifier_call = masq_inet6_event,
};
-static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+static int masq_refcnt;
+static DEFINE_MUTEX(masq_mutex);
-void nf_nat_masquerade_ipv6_register_notifier(void)
+int nf_nat_masquerade_ipv6_register_notifier(void)
{
+ int ret = 0;
+
+ mutex_lock(&masq_mutex);
/* check if the notifier is already set */
- if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
- return;
+ if (++masq_refcnt > 1)
+ goto out_unlock;
+
+ ret = register_netdevice_notifier(&masq_dev_notifier);
+ if (ret)
+ goto err_dec;
+
+ ret = register_inet6addr_notifier(&masq_inet6_notifier);
+ if (ret)
+ goto err_unregister;
- register_netdevice_notifier(&masq_dev_notifier);
- register_inet6addr_notifier(&masq_inet_notifier);
+ mutex_unlock(&masq_mutex);
+ return ret;
+
+err_unregister:
+ unregister_netdevice_notifier(&masq_dev_notifier);
+err_dec:
+ masq_refcnt--;
+out_unlock:
+ mutex_unlock(&masq_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier);
void nf_nat_masquerade_ipv6_unregister_notifier(void)
{
+ mutex_lock(&masq_mutex);
/* check if the notifier still has clients */
- if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
- return;
+ if (--masq_refcnt > 0)
+ goto out_unlock;
- unregister_inet6addr_notifier(&masq_inet_notifier);
+ unregister_inet6addr_notifier(&masq_inet6_notifier);
unregister_netdevice_notifier(&masq_dev_notifier);
+out_unlock:
+ mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
deleted file mode 100644
index d9bf42ba44fa..000000000000
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6
- * NAT funded by Astaro.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/icmpv6.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool
-icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
- ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
-}
-
-static void
-icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u16 id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
-
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xffff;
-
- for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
- (id % range_size));
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
-}
-
-static bool
-icmpv6_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct icmp6hdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct icmp6hdr *)(skb->data + hdroff);
- l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
- tuple, maniptype);
- if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST ||
- hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
- inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
- hdr->icmp6_identifier,
- tuple->src.u.icmp.id, false);
- hdr->icmp6_identifier = tuple->src.u.icmp.id;
- }
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = {
- .l4proto = IPPROTO_ICMPV6,
- .manip_pkt = icmpv6_manip_pkt,
- .in_range = icmpv6_in_range,
- .unique_tuple = icmpv6_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 24858402e374..b9c8a763c863 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
{
+ struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
struct tcphdr _otcph;
const struct tcphdr *otcph;
@@ -197,15 +198,18 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- if (oldskb->nf_bridge) {
+ br_indev = nf_bridge_get_physindev(oldskb);
+ if (br_indev) {
struct ethhdr *oeth = eth_hdr(oldskb);
- nskb->dev = nf_bridge_get_physindev(oldskb);
+ nskb->dev = br_indev;
nskb->protocol = htons(ETH_P_IPV6);
ip6h->payload_len = htons(sizeof(struct tcphdr));
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
- oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ oeth->h_source, oeth->h_dest, nskb->len) < 0) {
+ kfree_skb(nskb);
return;
+ }
dev_queue_xmit(nskb);
} else
#endif
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index dd0122f3cffe..e06c82e9dfcd 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -70,7 +70,9 @@ static int __init nft_masq_ipv6_module_init(void)
if (ret < 0)
return ret;
- nf_nat_masquerade_ipv6_register_notifier();
+ ret = nf_nat_masquerade_ipv6_register_notifier();
+ if (ret)
+ nft_unregister_expr(&nft_masq_ipv6_type);
return ret;
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5e0efd3954e9..5a426226c762 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
continue;
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif)
+ if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif))
continue;
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
@@ -658,6 +657,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->ip_summed = CHECKSUM_NONE;
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
+
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c3c92713096..a5bb59ee50ac 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
{
struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
struct sk_buff *fp, *head = fq->q.fragments;
- int payload_len;
+ int payload_len, delta;
unsigned int nhoff;
int sum_truesize;
u8 ecn;
@@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
if (payload_len > IPV6_MAXPLEN)
goto out_oversize;
+ delta = - head->truesize;
+
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
goto out_oom;
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(fq->q.net, delta);
+
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
@@ -378,6 +384,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
kfree_skb_partial(fp, headstolen);
} else {
+ fp->sk = NULL;
if (!skb_shinfo(head)->frag_list)
skb_shinfo(head)->frag_list = fp;
head->data_len += fp->len;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 059f0531f7c1..194bc162866d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2977,7 +2977,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!rt)
goto out;
- rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+ rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
+ extack);
if (IS_ERR(rt->fib6_metrics)) {
err = PTR_ERR(rt->fib6_metrics);
/* Do not leave garbage there. */
@@ -3710,7 +3711,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
if (!f6i)
return ERR_PTR(-ENOMEM);
- f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
+ f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
f6i->dst_nocount = true;
f6i->dst_host = true;
f6i->fib6_protocol = RTPROT_KERNEL;
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index a8854dd3e9c5..8181ee7e1e27 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct flowi6 fl6;
+ memset(&fl6, 0, sizeof(fl6));
fl6.daddr = hdr->daddr;
fl6.saddr = hdr->saddr;
fl6.flowlabel = ip6_flowinfo(hdr);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a2bc53..b81eb7cb815e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
}
}
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
fatal = icmpv6_err_convert(type, code, &err);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq, fatal);
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, fatal);
+ return 0;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
@@ -734,6 +737,7 @@ static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
{
+ bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
struct inet_request_sock *ireq = inet_rsk(req);
const struct ipv6_pinfo *np = inet6_sk(sk_listener);
@@ -741,7 +745,7 @@ static void tcp_v6_init_req(struct request_sock *req,
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
/* So that link locals have meaning */
- if (!sk_listener->sk_bound_dev_if &&
+ if ((!sk_listener->sk_bound_dev_if || l3_slave) &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index e72947c99454..3179c425d7ff 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -9,14 +9,15 @@
*
* TCPv6 GSO/GRO support
*/
+#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
-static struct sk_buff *tcp6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
@@ -29,7 +30,7 @@ static struct sk_buff *tcp6_gro_receive(struct list_head *head,
return tcp_gro_receive(head, skb);
}
-static int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index dae25cad05cd..1991dede7367 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -134,24 +134,28 @@ drop:
return 0;
}
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
-static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel46_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol tunnel6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..9cbf363172bd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -45,6 +45,7 @@
#include <net/raw.h>
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
#include <net/xfrm.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
@@ -117,12 +118,16 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
sk->sk_family != PF_INET6)
return -1;
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
score = 0;
inet = inet_sk(sk);
@@ -132,27 +137,16 @@ static int compute_score(struct sock *sk, struct net *net,
score++;
}
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
- score++;
- }
-
if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
return -1;
score++;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif);
+ if (!dev_match)
+ return -1;
+ score++;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -200,66 +194,32 @@ struct sock *__udp6_lib_lookup(struct net *net,
int dif, int sdif, struct udp_table *udptable,
struct sk_buff *skb)
{
- struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
+ struct sock *result;
bool exact_dif = udp6_lib_exact_dif_match(net, skb);
- int score, badness;
- u32 hash = 0;
- if (hslot->count > 10) {
- hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif, exact_dif,
+ hslot2, skb);
+ if (!result) {
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
+
hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif,
- hslot2, skb);
- if (!result) {
- unsigned int old_slot2 = slot2;
- hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
- slot2 = hash2 & udptable->mask;
- /* avoid searching the same slot again. */
- if (unlikely(slot2 == old_slot2))
- return result;
-
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2,
- skb);
- }
- if (unlikely(IS_ERR(result)))
- return NULL;
- return result;
- }
-begin:
- result = NULL;
- badness = -1;
- sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
- sdif, exact_dif);
- if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (unlikely(IS_ERR(result)))
- return NULL;
- if (result)
- return result;
- }
- result = sk;
- badness = score;
- }
+ &in6addr_any, hnum, dif, sdif,
+ exact_dif, hslot2,
+ skb);
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -329,6 +289,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
+ struct udp_mib *mib;
int is_udp4;
if (flags & MSG_ERRQUEUE)
@@ -352,6 +313,7 @@ try_again:
msg->msg_flags |= MSG_TRUNC;
is_udp4 = (skb->protocol == htons(ETH_P_IP));
+ mib = __UDPX_MIB(sk, is_udp4);
/*
* If checksum is needed at all, try to do it while copying the
@@ -380,24 +342,13 @@ try_again:
if (unlikely(err)) {
if (!peeked) {
atomic_inc(&sk->sk_drops);
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
return err;
}
- if (!peeked) {
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- }
+ if (!peeked)
+ SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
sock_recv_ts_and_drops(msg, sk, skb);
@@ -421,6 +372,9 @@ try_again:
*addr_len = sizeof(*sin6);
}
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (np->rxopt.all)
ip6_datagram_recv_common_ctl(sk, msg, skb);
@@ -443,17 +397,8 @@ try_again:
csum_copy_err:
if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
udp_skb_destructor)) {
- if (is_udp4) {
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- } else {
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- }
+ SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
@@ -463,15 +408,106 @@ csum_copy_err:
goto try_again;
}
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info,
- struct udp_table *udptable)
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+ static_branch_inc(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info);
+
+ if (!ip6tun_encaps[i])
+ continue;
+ handler = rcu_dereference(ip6tun_encaps[i]->err_handler);
+ if (handler && !handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+ const struct ipv6hdr *hdr, int offset,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, __be32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv6 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, offset);
+
+ sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+ &hdr->saddr, uh->dest,
+ inet6_iif(skb), 0, udptable, skb);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk) {
+ sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+ offset, info));
+ }
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info,
+ struct udp_table *udptable)
{
struct ipv6_pinfo *np;
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct in6_addr *saddr = &hdr->saddr;
const struct in6_addr *daddr = &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -480,9 +516,23 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, skb);
if (!sk) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return;
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+ udptable, skb,
+ opt, type, code, info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
harderr = icmpv6_err_convert(type, code, &err);
@@ -496,10 +546,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
harderr = 1;
}
if (type == NDISC_REDIRECT) {
- ip6_sk_redirect(skb, sk);
+ if (tunnel) {
+ ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+ sk->sk_mark, sk->sk_uid);
+ } else {
+ ip6_sk_redirect(skb, sk);
+ }
goto out;
}
+ /* Tunnels don't have an application socket: don't pass errors back */
+ if (tunnel)
+ goto out;
+
if (!np->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -510,7 +569,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -541,21 +600,14 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static __inline__ void udpv6_err(struct sk_buff *skb,
- struct inet6_skb_parm *opt, u8 type,
- u8 code, int offset, __be32 info)
+static __inline__ int udpv6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt, u8 type,
+ u8 code, int offset, __be32 info)
{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+ return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
-{
- static_branch_enable(&udpv6_encap_needed_key);
-}
-EXPORT_SYMBOL(udpv6_encap_enable);
-
-static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -638,10 +690,32 @@ drop:
return -1;
}
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udpv6_queue_rcv_one_skb(sk, skb);
+
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, false);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ ret = udpv6_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+ true);
+ }
+ return 0;
+}
+
static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, const struct in6_addr *loc_addr,
__be16 rmt_port, const struct in6_addr *rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
@@ -653,7 +727,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+ !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -687,6 +761,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+ int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
@@ -701,7 +776,8 @@ start_lookup:
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif,
+ hnum))
continue;
/* If zero checksum and no_check is not on for
* the socket then skip it.
@@ -1458,11 +1534,15 @@ void udpv6_destroy_sock(struct sock *sk)
udp_v6_flush_pending_frames(sk);
release_sock(sk);
- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_dec(&udpv6_encap_needed_key);
}
inet6_destroy_sock(sk);
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 7903e21c178b..5730e6503cb4 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -9,8 +9,8 @@
#include <net/transp_v6.h>
int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
- __be32, struct udp_table *);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+ __be32, struct udp_table *);
int udp_v6_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..83b11d0ac091 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -11,6 +11,7 @@
*/
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/udp.h>
@@ -114,8 +115,8 @@ out:
return segs;
}
-static struct sk_buff *udp6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
@@ -142,18 +143,14 @@ flush:
return NULL;
}
-static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
}
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5000ad6878e6..a125aebc29e5 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb)
return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplitev6_err(struct sk_buff *skb,
+static int udplitev6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ &udplite_table);
}
static const struct inet6_protocol udplitev6_protocol = {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 9ef490dddcea..a52cb3fc6df5 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -86,14 +86,16 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
{
struct net *net = dev_net(skb->dev);
struct xfrm_state *x = NULL;
+ struct sec_path *sp;
int i = 0;
- if (secpath_set(skb)) {
+ sp = secpath_set(skb);
+ if (!sp) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
goto drop;
}
- if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
+ if (1 + sp->len == XFRM_MAX_DEPTH) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
}
@@ -145,7 +147,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
goto drop;
}
- skb->sp->xvec[skb->sp->len++] = x;
+ sp->xvec[sp->len++] = x;
spin_lock(&x->lock);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index d35bcf92969c..769f8f78d3b8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
if (xdst->u.rt6.rt6i_idev->dev == dev) {
struct inet6_dev *loopback_idev =
in6_dev_get(dev_net(dev)->loopback_dev);
- BUG_ON(!loopback_idev);
do {
in6_dev_put(xdst->u.rt6.rt6i_idev);
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index b2dc8ce49378..cc979b702c89 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(esp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ah_rcv(struct sk_buff *skb)
@@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ah6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
@@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ipcomp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol esp6_protocol = {
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 4a46df8441c9..f5b4febeaa25 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -144,6 +144,9 @@ static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
index = __xfrm6_tunnel_spi_check(net, spi);
if (index >= 0)
goto alloc_spi;
+
+ if (spi == XFRM6_TUNNEL_SPI_MAX)
+ break;
}
for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) {
index = __xfrm6_tunnel_spi_check(net, spi);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 0bed4cc20603..78ea5a739d10 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1873,30 +1873,26 @@ static void iucv_callback_txdone(struct iucv_path *path,
struct sock *sk = path->private;
struct sk_buff *this = NULL;
struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q;
- struct sk_buff *list_skb = list->next;
+ struct sk_buff *list_skb;
unsigned long flags;
bh_lock_sock(sk);
- if (!skb_queue_empty(list)) {
- spin_lock_irqsave(&list->lock, flags);
- while (list_skb != (struct sk_buff *)list) {
- if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
- this = list_skb;
- break;
- }
- list_skb = list_skb->next;
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk(list, list_skb) {
+ if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
+ this = list_skb;
+ break;
}
- if (this)
- __skb_unlink(this, list);
-
- spin_unlock_irqrestore(&list->lock, flags);
+ }
+ if (this)
+ __skb_unlink(this, list);
+ spin_unlock_irqrestore(&list->lock, flags);
- if (this) {
- kfree_skb(this);
- /* wake up any process waiting for sending */
- iucv_sock_wake_msglim(sk);
- }
+ if (this) {
+ kfree_skb(this);
+ /* wake up any process waiting for sending */
+ iucv_sock_wake_msglim(sk);
}
if (sk->sk_state == IUCV_CLOSING) {
@@ -2284,11 +2280,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
list = &iucv->send_skb_q;
spin_lock_irqsave(&list->lock, flags);
- if (skb_queue_empty(list))
- goto out_unlock;
- list_skb = list->next;
- nskb = list_skb->next;
- while (list_skb != (struct sk_buff *)list) {
+ skb_queue_walk_safe(list, list_skb, nskb) {
if (skb_shinfo(list_skb) == skb_shinfo(skb)) {
switch (n) {
case TX_NOTIFY_OK:
@@ -2321,10 +2313,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
}
break;
}
- list_skb = nskb;
- nskb = nskb->next;
}
-out_unlock:
spin_unlock_irqrestore(&list->lock, flags);
if (sk->sk_state == IUCV_CLOSING) {
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9d61266526e7..655c787f9d54 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2020,7 +2020,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
{
- struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
if (xfrm_ctx) {
int len = sizeof(struct sadb_x_sec_ctx);
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8da86ceca33d..309dee76724e 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -47,6 +47,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
/**
+ * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master
+ * device
+ * @net: network namespace for device index lookup
+ * @ifindex: targeted interface
+ */
+int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(net, ifindex);
+ while (dev && !netif_is_l3_master(dev))
+ dev = netdev_master_upper_dev_get(dev);
+
+ return dev ? dev->ifindex : 0;
+}
+EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
+
+/**
* l3mdev_fib_table - get FIB table id associated with an L3
* master interface
* @dev: targeted interface
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index f869e35d0974..be471fe95048 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -57,14 +57,13 @@ comment "Some wireless drivers require a rate control algorithm"
depends on MAC80211 && MAC80211_HAS_RC=n
config MAC80211_MESH
- bool "Enable mac80211 mesh networking (pre-802.11s) support"
+ bool "Enable mac80211 mesh networking support"
depends on MAC80211
---help---
- This options enables support of Draft 802.11s mesh networking.
- The implementation is based on Draft 2.08 of the Mesh Networking
- amendment. However, no compliance with that draft is claimed or even
- possible, as drafts leave a number of identifiers to be defined after
- ratification. For more information visit http://o11s.org/.
+ Select this option to enable 802.11 mesh operation in mac80211
+ drivers that support it. 802.11 mesh connects multiple stations
+ over (possibly multi-hop) wireless links to form a single logical
+ LAN.
config MAC80211_LEDS
bool "Enable LED triggers"
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 51622333d460..de65fe3ed9cc 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -800,8 +800,8 @@ static int ieee80211_set_ftm_responder_params(
u8 *pos;
int len;
- if ((!lci || !lci_len) && (!civicloc || !civicloc_len))
- return 1;
+ if (!lci_len && !civicloc_len)
+ return 0;
bss_conf = &sdata->vif.bss_conf;
old = bss_conf->ftmr_params;
@@ -2028,6 +2028,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
nconf->dot11MeshAwakeWindowDuration;
if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask))
conf->plink_timeout = nconf->plink_timeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask))
+ conf->dot11MeshConnectedToMeshGate =
+ nconf->dot11MeshConnectedToMeshGate;
ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON);
return 0;
}
@@ -2891,7 +2894,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
beacon->proberesp_ies_len + beacon->assocresp_ies_len +
- beacon->probe_resp_len;
+ beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len;
new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
if (!new_beacon)
@@ -2934,8 +2937,9 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
pos += beacon->probe_resp_len;
}
- if (beacon->ftm_responder)
- new_beacon->ftm_responder = beacon->ftm_responder;
+
+ /* might copy -1, meaning no changes requested */
+ new_beacon->ftm_responder = beacon->ftm_responder;
if (beacon->lci) {
new_beacon->lci_len = beacon->lci_len;
new_beacon->lci = pos;
@@ -3849,6 +3853,26 @@ ieee80211_get_ftm_responder_stats(struct wiphy *wiphy,
return drv_get_ftm_responder_stats(local, sdata, ftm_stats);
}
+static int
+ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev,
+ struct cfg80211_pmsr_request *request)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev);
+
+ return drv_start_pmsr(local, sdata, request);
+}
+
+static void
+ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev,
+ struct cfg80211_pmsr_request *request)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev);
+
+ return drv_abort_pmsr(local, sdata, request);
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3944,4 +3968,6 @@ const struct cfg80211_ops mac80211_config_ops = {
.tx_control_port = ieee80211_tx_control_port,
.get_txq_stats = ieee80211_get_txq_stats,
.get_ftm_responder_stats = ieee80211_get_ftm_responder_stats,
+ .start_pmsr = ieee80211_start_pmsr,
+ .abort_pmsr = ieee80211_abort_pmsr,
};
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c813207bb123..cff0fb3578c9 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -641,6 +641,8 @@ IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval,
IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC);
IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration,
u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC);
+IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate,
+ u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC);
#endif
#define DEBUGFS_ADD_MODE(name, mode) \
@@ -762,6 +764,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval);
MESHPARAMS_ADD(power_mode);
MESHPARAMS_ADD(dot11MeshAwakeWindowDuration);
+ MESHPARAMS_ADD(dot11MeshConnectedToMeshGate);
#undef MESHPARAMS_ADD
}
#endif
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index af5185a836e5..b753194710ad 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -795,22 +795,22 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
#define PRINT_NSS_SUPP(f, n) \
do { \
- int i; \
+ int _i; \
u16 v = le16_to_cpu(nss->f); \
p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \
- for (i = 0; i < 8; i += 2) { \
- switch ((v >> i) & 0x3) { \
+ for (_i = 0; _i < 8; _i += 2) { \
+ switch ((v >> _i) & 0x3) { \
case 0: \
- PRINT(n "-%d-SUPPORT-0-7", i / 2); \
+ PRINT(n "-%d-SUPPORT-0-7", _i / 2); \
break; \
case 1: \
- PRINT(n "-%d-SUPPORT-0-9", i / 2); \
+ PRINT(n "-%d-SUPPORT-0-9", _i / 2); \
break; \
case 2: \
- PRINT(n "-%d-SUPPORT-0-11", i / 2); \
+ PRINT(n "-%d-SUPPORT-0-11", _i / 2); \
break; \
case 3: \
- PRINT(n "-%d-NOT-SUPPORTED", i / 2); \
+ PRINT(n "-%d-NOT-SUPPORTED", _i / 2); \
break; \
} \
} \
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 0b1747a2313d..3e0d5922a440 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1199,6 +1199,40 @@ drv_get_ftm_responder_stats(struct ieee80211_local *local,
return ret;
}
+static inline int drv_start_pmsr(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_pmsr_request *request)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_start_pmsr(local, sdata);
+
+ if (local->ops->start_pmsr)
+ ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_abort_pmsr(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_pmsr_request *request)
+{
+ trace_drv_abort_pmsr(local, sdata);
+
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ if (local->ops->abort_pmsr)
+ local->ops->abort_pmsr(&local->hw, &sdata->vif, request);
+ trace_drv_return_void(local);
+}
+
static inline int drv_start_nan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct cfg80211_nan_conf *conf)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 10a05062e4a0..7dfb4e2f98b2 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -500,6 +500,7 @@ struct ieee80211_if_managed {
unsigned int uapsd_max_sp_len;
int wmm_last_param_set;
+ int mu_edca_last_param_set;
u8 use_4addr;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 5836ddeac9e3..4a6ff1482a9f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -7,6 +7,7 @@
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (c) 2016 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -1015,6 +1016,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
if (local->open_count == 0)
ieee80211_clear_tx_pending(local);
+ sdata->vif.bss_conf.beacon_int = 0;
+
/*
* If the interface goes down while suspended, presumably because
* the device was unplugged and that happens before our resume,
@@ -1799,7 +1802,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
}
ieee80211_assign_perm_addr(local, ndev->perm_addr, type);
- if (params && is_valid_ether_addr(params->macaddr))
+ if (is_valid_ether_addr(params->macaddr))
memcpy(ndev->dev_addr, params->macaddr, ETH_ALEN);
else
memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN);
@@ -1868,11 +1871,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ieee80211_setup_sdata(sdata, type);
if (ndev) {
- if (params) {
- ndev->ieee80211_ptr->use_4addr = params->use_4addr;
- if (type == NL80211_IFTYPE_STATION)
- sdata->u.mgd.use_4addr = params->use_4addr;
- }
+ ndev->ieee80211_ptr->use_4addr = params->use_4addr;
+ if (type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.use_4addr = params->use_4addr;
ndev->features |= local->hw.netdev_features;
@@ -1949,6 +1950,8 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
WARN(local->open_count, "%s: open count remains %d\n",
wiphy_name(local->hw.wiphy), local->open_count);
+ ieee80211_txq_teardown_flows(local);
+
mutex_lock(&local->iflist_mtx);
list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
list_del(&sdata->list);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 83e71e6b2ebe..87a729926734 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1221,8 +1221,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* add one default STA interface if supported */
if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
!ieee80211_hw_check(hw, NO_AUTO_VIF)) {
+ struct vif_params params = {0};
+
result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL,
- NL80211_IFTYPE_STATION, NULL);
+ NL80211_IFTYPE_STATION, &params);
if (result)
wiphy_warn(local->hw.wiphy,
"Failed to add default virtual iface\n");
@@ -1262,7 +1264,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
rtnl_unlock();
ieee80211_led_exit(local);
ieee80211_wep_free(local);
- ieee80211_txq_teardown_flows(local);
fail_flows:
destroy_workqueue(local->workqueue);
fail_workqueue:
@@ -1288,7 +1289,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
#if IS_ENABLED(CONFIG_IPV6)
unregister_inet6addr_notifier(&local->ifa6_notifier);
#endif
- ieee80211_txq_teardown_flows(local);
rtnl_lock();
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 8bad414c52ad..c90452aa0c42 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -254,6 +254,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
u8 *pos, neighbors;
u8 meshconf_len = sizeof(struct ieee80211_meshconf_ie);
+ bool is_connected_to_gate = ifmsh->num_gates > 0 ||
+ ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol ||
+ ifmsh->mshcfg.dot11MeshConnectedToMeshGate;
if (skb_tailroom(skb) < 2 + meshconf_len)
return -ENOMEM;
@@ -278,7 +281,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
/* Mesh Formation Info - number of neighbors */
neighbors = atomic_read(&ifmsh->estab_plinks);
neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS);
- *pos++ = neighbors << 1;
+ *pos++ = (neighbors << 1) | is_connected_to_gate;
/* Mesh capability */
*pos = 0x00;
*pos |= ifmsh->mshcfg.dot11MeshForwarding ?
@@ -1191,7 +1194,8 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
if (!sdata->u.mesh.user_mpm ||
sdata->u.mesh.mshcfg.rssi_threshold == 0 ||
sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal)
- mesh_neighbour_update(sdata, mgmt->sa, &elems);
+ mesh_neighbour_update(sdata, mgmt->sa, &elems,
+ rx_status);
}
if (ifmsh->sync_ops)
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 21526630bf65..cad6592c52a1 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -273,7 +273,8 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata);
/* Mesh plinks */
void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
- u8 *hw_addr, struct ieee802_11_elems *ie);
+ u8 *hw_addr, struct ieee802_11_elems *ie,
+ struct ieee80211_rx_status *rx_status);
bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie);
u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata);
void mesh_plink_timer(struct timer_list *t);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 5b5b0f95ffd1..33055c8ed37e 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -513,7 +513,8 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr)
static struct sta_info *
mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
- struct ieee802_11_elems *elems)
+ struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status)
{
struct sta_info *sta = NULL;
@@ -521,11 +522,17 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
if (sdata->u.mesh.user_mpm ||
sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
if (mesh_peer_accepts_plinks(elems) &&
- mesh_plink_availables(sdata))
+ mesh_plink_availables(sdata)) {
+ int sig = 0;
+
+ if (ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM))
+ sig = rx_status->signal;
+
cfg80211_notify_new_peer_candidate(sdata->dev, addr,
elems->ie_start,
elems->total_len,
- GFP_KERNEL);
+ sig, GFP_KERNEL);
+ }
} else
sta = __mesh_sta_info_alloc(sdata, addr);
@@ -538,13 +545,15 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
* @sdata: local meshif
* @addr: peer's address
* @elems: IEs from beacon or mesh peering frame.
+ * @rx_status: rx status for the frame for signal reporting
*
* Return existing or newly allocated sta_info under RCU read lock.
* (re)initialize with given IEs.
*/
static struct sta_info *
mesh_sta_info_get(struct ieee80211_sub_if_data *sdata,
- u8 *addr, struct ieee802_11_elems *elems) __acquires(RCU)
+ u8 *addr, struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status) __acquires(RCU)
{
struct sta_info *sta = NULL;
@@ -555,7 +564,7 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata,
} else {
rcu_read_unlock();
/* can't run atomic */
- sta = mesh_sta_info_alloc(sdata, addr, elems);
+ sta = mesh_sta_info_alloc(sdata, addr, elems, rx_status);
if (!sta) {
rcu_read_lock();
return NULL;
@@ -576,20 +585,25 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata,
* @sdata: local meshif
* @addr: peer's address
* @elems: IEs from beacon or mesh peering frame
+ * @rx_status: rx status for the frame for signal reporting
*
* Initiates peering if appropriate.
*/
void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
u8 *hw_addr,
- struct ieee802_11_elems *elems)
+ struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status)
{
struct sta_info *sta;
u32 changed = 0;
- sta = mesh_sta_info_get(sdata, hw_addr, elems);
+ sta = mesh_sta_info_get(sdata, hw_addr, elems, rx_status);
if (!sta)
goto out;
+ sta->mesh->connected_to_gate = elems->mesh_config->meshconf_form &
+ IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE;
+
if (mesh_peer_accepts_plinks(elems) &&
sta->mesh->plink_state == NL80211_PLINK_LISTEN &&
sdata->u.mesh.accepting_plinks &&
@@ -1069,7 +1083,8 @@ out:
static void
mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
- struct ieee802_11_elems *elems)
+ struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status)
{
struct sta_info *sta;
@@ -1134,7 +1149,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata,
if (event == OPN_ACPT) {
rcu_read_unlock();
/* allocate sta entry if necessary and update info */
- sta = mesh_sta_info_get(sdata, mgmt->sa, elems);
+ sta = mesh_sta_info_get(sdata, mgmt->sa, elems, rx_status);
if (!sta) {
mpl_dbg(sdata, "Mesh plink: failed to init peer!\n");
goto unlock_rcu;
@@ -1200,5 +1215,5 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
return;
}
ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems);
- mesh_process_plink_frame(sdata, mgmt, &elems);
+ mesh_process_plink_frame(sdata, mgmt, &elems, rx_status);
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d2bc8d57c87e..687821567287 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -916,6 +916,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
ieee80211_add_vht_ie(sdata, skb, sband,
&assoc_data->ap_vht_cap);
+ /*
+ * If AP doesn't support HT, mark HE as disabled.
+ * If on the 5GHz band, make sure it supports VHT.
+ */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HT ||
+ (sband->band == NL80211_BAND_5GHZ &&
+ ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
ieee80211_add_he_ie(sdata, skb, sband);
@@ -1869,7 +1878,7 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local,
struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
size_t left;
- int count, ac;
+ int count, mu_edca_count, ac;
const u8 *pos;
u8 uapsd_queues = 0;
@@ -1889,9 +1898,16 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local,
uapsd_queues = ifmgd->uapsd_queues;
count = wmm_param[6] & 0x0f;
- if (count == ifmgd->wmm_last_param_set)
+ /* -1 is the initial value of ifmgd->mu_edca_last_param_set.
+ * if mu_edca was preset before and now it disappeared tell
+ * the driver about it.
+ */
+ mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1;
+ if (count == ifmgd->wmm_last_param_set &&
+ mu_edca_count == ifmgd->mu_edca_last_param_set)
return false;
ifmgd->wmm_last_param_set = count;
+ ifmgd->mu_edca_last_param_set = mu_edca_count;
pos = wmm_param + 8;
left = wmm_param_len - 8;
@@ -2766,6 +2782,7 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct sta_info *sta;
+ bool result = true;
sdata_info(sdata, "authenticated\n");
ifmgd->auth_data->done = true;
@@ -2778,15 +2795,18 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
sta = sta_info_get(sdata, bssid);
if (!sta) {
WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
- return false;
+ result = false;
+ goto out;
}
if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
sdata_info(sdata, "failed moving %pM to auth\n", bssid);
- return false;
+ result = false;
+ goto out;
}
- mutex_unlock(&sdata->local->sta_mtx);
- return true;
+out:
+ mutex_unlock(&sdata->local->sta_mtx);
+ return result;
}
static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
@@ -3058,6 +3078,19 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
}
}
+static bool ieee80211_twt_req_supported(const struct sta_info *sta,
+ const struct ieee802_11_elems *elems)
+{
+ if (elems->ext_capab_len < 10)
+ return false;
+
+ if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT))
+ return false;
+
+ return sta->sta.he_cap.he_cap_elem.mac_cap_info[0] &
+ IEEE80211_HE_MAC_CAP0_TWT_RES;
+}
+
static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss,
struct ieee80211_mgmt *mgmt, size_t len)
@@ -3211,16 +3244,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out;
}
- /*
- * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark
- * HE as disabled. If on the 5GHz band, make sure it supports VHT.
- */
- if (ifmgd->flags & IEEE80211_STA_DISABLE_HT ||
- (sband->band == NL80211_BAND_5GHZ &&
- ifmgd->flags & IEEE80211_STA_DISABLE_VHT) ||
- (!elems.he_cap && !elems.he_operation))
- ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
-
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
(!elems.he_cap || !elems.he_operation)) {
mutex_unlock(&sdata->local->sta_mtx);
@@ -3247,8 +3270,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta);
bss_conf->he_support = sta->sta.he_cap.has_he;
+ bss_conf->twt_requester =
+ ieee80211_twt_req_supported(sta, &elems);
} else {
bss_conf->he_support = false;
+ bss_conf->twt_requester = false;
}
if (bss_conf->he_support) {
@@ -3333,6 +3359,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* 4-bit value.
*/
ifmgd->wmm_last_param_set = -1;
+ ifmgd->mu_edca_last_param_set = -1;
if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
ieee80211_set_wmm_default(sdata, false, false);
@@ -4656,8 +4683,10 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
- if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
- ieee80211_get_he_sta_cap(sband)) {
+ if (!ieee80211_get_he_sta_cap(sband))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) {
const struct cfg80211_bss_ies *ies;
const u8 *he_oper_ie;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3bd3b5769797..45aad3d3108c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -143,6 +143,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
/* allocate extra bitmaps */
if (status->chains)
len += 4 * hweight8(status->chains);
+ /* vendor presence bitmap */
+ if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)
+ len += 4;
if (ieee80211_have_rx_timestamp(status)) {
len = ALIGN(len, 8);
@@ -207,8 +210,6 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
struct ieee80211_vendor_radiotap *rtap = (void *)skb->data;
- /* vendor presence bitmap */
- len += 4;
/* alignment for fixed 6-byte vendor data header */
len = ALIGN(len, 2);
/* vendor data header */
@@ -753,6 +754,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
struct ieee80211_sub_if_data *monitor_sdata =
rcu_dereference(local->monitor_sdata);
bool only_monitor = false;
+ unsigned int min_head_len;
if (status->flag & RX_FLAG_RADIOTAP_HE)
rtap_space += sizeof(struct ieee80211_radiotap_he);
@@ -760,12 +762,18 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
rtap_space += sizeof(struct ieee80211_radiotap_he_mu);
+ if (status->flag & RX_FLAG_RADIOTAP_LSIG)
+ rtap_space += sizeof(struct ieee80211_radiotap_lsig);
+
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
- struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
+ struct ieee80211_vendor_radiotap *rtap =
+ (void *)(origskb->data + rtap_space);
rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
}
+ min_head_len = rtap_space;
+
/*
* First, we may need to make a copy of the skb because
* (1) we need to modify it for radiotap (if not present), and
@@ -775,18 +783,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
* the SKB because it has a bad FCS/PLCP checksum.
*/
- if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) {
- if (unlikely(origskb->len <= FCS_LEN)) {
- /* driver bug */
- WARN_ON(1);
- dev_kfree_skb(origskb);
- return NULL;
+ if (!(status->flag & RX_FLAG_NO_PSDU)) {
+ if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) {
+ if (unlikely(origskb->len <= FCS_LEN + rtap_space)) {
+ /* driver bug */
+ WARN_ON(1);
+ dev_kfree_skb(origskb);
+ return NULL;
+ }
+ present_fcs_len = FCS_LEN;
}
- present_fcs_len = FCS_LEN;
+
+ /* also consider the hdr->frame_control */
+ min_head_len += 2;
}
- /* ensure hdr->frame_control and vendor radiotap data are in skb head */
- if (!pskb_may_pull(origskb, 2 + rtap_space)) {
+ /* ensure that the expected data elements are in skb head */
+ if (!pskb_may_pull(origskb, min_head_len)) {
dev_kfree_skb(origskb);
return NULL;
}
@@ -1403,6 +1416,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (ieee80211_is_ctl(hdr->frame_control) ||
+ ieee80211_is_nullfunc(hdr->frame_control) ||
ieee80211_is_qos_nullfunc(hdr->frame_control) ||
is_multicast_ether_addr(hdr->addr1))
return RX_CONTINUE;
@@ -3063,7 +3077,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
cfg80211_sta_opmode_change_notify(sdata->dev,
rx->sta->addr,
&sta_opmode,
- GFP_KERNEL);
+ GFP_ATOMIC);
goto handled;
}
case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: {
@@ -3100,7 +3114,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
cfg80211_sta_opmode_change_notify(sdata->dev,
rx->sta->addr,
&sta_opmode,
- GFP_KERNEL);
+ GFP_ATOMIC);
goto handled;
}
default:
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 5d2a11777718..95413413f98c 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -356,7 +356,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
{
struct ieee80211_local *local = hw_to_local(hw);
- bool hw_scan = local->ops->hw_scan;
+ bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
bool was_scanning = local->scanning;
struct cfg80211_scan_request *scan_req;
struct ieee80211_sub_if_data *scan_sdata;
@@ -606,6 +606,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
struct cfg80211_scan_request *req)
{
struct ieee80211_local *local = sdata->local;
+ bool hw_scan = local->ops->hw_scan;
int rc;
lockdep_assert_held(&local->mtx);
@@ -620,7 +621,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
return 0;
}
- if (local->ops->hw_scan) {
+ again:
+ if (hw_scan) {
u8 *ies;
local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len;
@@ -679,7 +681,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
else
memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN);
- if (local->ops->hw_scan) {
+ if (hw_scan) {
__set_bit(SCAN_HW_SCANNING, &local->scanning);
} else if ((req->n_channels == 1) &&
(req->channels[0] == local->_oper_chandef.chan)) {
@@ -722,7 +724,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
ieee80211_recalc_idle(local);
- if (local->ops->hw_scan) {
+ if (hw_scan) {
WARN_ON(!ieee80211_prep_hw_scan(local));
rc = drv_hw_scan(local, sdata, local->hw_scan_req);
} else {
@@ -740,6 +742,18 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
RCU_INIT_POINTER(local->scan_sdata, NULL);
}
+ if (hw_scan && rc == 1) {
+ /*
+ * we can't fall back to software for P2P-GO
+ * as it must update NoA etc.
+ */
+ if (ieee80211_vif_type_p2p(&sdata->vif) ==
+ NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+ hw_scan = false;
+ goto again;
+ }
+
return rc;
}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index fb8c2252ac0e..c4a8f115ed33 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2253,11 +2253,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
- for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
- struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
-
- sta_set_tidstats(sta, tidstats, i);
- }
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
+ sta_set_tidstats(sta, &sinfo->pertid[i], i);
}
if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -2267,7 +2264,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
BIT_ULL(NL80211_STA_INFO_PEER_PM) |
- BIT_ULL(NL80211_STA_INFO_NONPEER_PM);
+ BIT_ULL(NL80211_STA_INFO_NONPEER_PM) |
+ BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE);
sinfo->llid = sta->mesh->llid;
sinfo->plid = sta->mesh->plid;
@@ -2279,6 +2277,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
sinfo->local_pm = sta->mesh->local_pm;
sinfo->peer_pm = sta->mesh->peer_pm;
sinfo->nonpeer_pm = sta->mesh->nonpeer_pm;
+ sinfo->connected_to_gate = sta->mesh->connected_to_gate;
#endif
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 9a04327d71d1..8eb29041be54 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -364,6 +364,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8)
* @nonpeer_pm: STA power save mode towards non-peer neighbors
* @processed_beacon: set to true after peer rates and capabilities are
* processed
+ * @connected_to_gate: true if mesh STA has a path to a mesh gate
* @fail_avg: moving percentage of failed MSDUs
*/
struct mesh_sta {
@@ -381,6 +382,7 @@ struct mesh_sta {
u8 plink_retries;
bool processed_beacon;
+ bool connected_to_gate;
enum nl80211_plink_state plink_state;
u32 plink_timeout;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index aa4afbf0abaf..3f0b96e1e02f 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -556,6 +556,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
}
ieee80211_led_tx(local);
+
+ if (skb_has_frag_list(skb)) {
+ kfree_skb_list(skb_shinfo(skb)->frag_list);
+ skb_shinfo(skb)->frag_list = NULL;
+ }
}
/*
@@ -964,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
/* Track when last TDLS packet was ACKed */
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
sta->status_stats.last_tdls_pkt_time = jiffies;
+ } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ return;
} else {
ieee80211_lost_packet(sta, info);
}
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 588c51a67c89..35ea0dcb55e6 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1052,10 +1052,10 @@ TRACE_EVENT(drv_ampdu_action,
);
TRACE_EVENT(drv_get_survey,
- TP_PROTO(struct ieee80211_local *local, int idx,
+ TP_PROTO(struct ieee80211_local *local, int _idx,
struct survey_info *survey),
- TP_ARGS(local, idx, survey),
+ TP_ARGS(local, _idx, survey),
TP_STRUCT__entry(
LOCAL_ENTRY
@@ -1064,7 +1064,7 @@ TRACE_EVENT(drv_get_survey,
TP_fast_assign(
LOCAL_ASSIGN;
- __entry->idx = idx;
+ __entry->idx = _idx;
),
TP_printk(
@@ -1882,6 +1882,18 @@ TRACE_EVENT(drv_del_nan_func,
)
);
+DEFINE_EVENT(local_sdata_evt, drv_start_pmsr,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_abort_pmsr,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
/*
* Tracing for API calls that drivers call.
*/
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e0ccee23fbcd..f170d6c6629a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -439,8 +439,8 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL))
info->hw_queue = tx->sdata->vif.cab_queue;
- /* no stations in PS mode */
- if (!atomic_read(&ps->num_sta_ps))
+ /* no stations in PS mode and no buffered packets */
+ if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf))
return TX_CONTINUE;
info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM;
@@ -3218,6 +3218,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
return false;
+ if (skb_is_gso(skb))
+ return false;
+
if (!txq)
return false;
@@ -3242,7 +3245,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
tin = &txqi->tin;
flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func);
head = skb_peek_tail(&flow->queue);
- if (!head)
+ if (!head || skb_is_gso(head))
goto out;
orig_len = head->len;
@@ -3583,7 +3586,7 @@ begin:
skb_queue_splice_tail(&tx.skbs, &txqi->frags);
}
- if (skb && skb_has_frag_list(skb) &&
+ if (skb_has_frag_list(skb) &&
!ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
if (skb_linearize(skb)) {
ieee80211_free_txskb(&local->hw, skb);
@@ -4579,7 +4582,7 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
IEEE80211_STYPE_NULLFUNC |
IEEE80211_FCTL_TODS);
if (qos) {
- __le16 qos = cpu_to_le16(7);
+ __le16 qoshdr = cpu_to_le16(7);
BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC |
IEEE80211_STYPE_NULLFUNC) !=
@@ -4588,7 +4591,7 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC);
skb->priority = 7;
skb_set_queue_mapping(skb, IEEE80211_AC_VO);
- skb_put_data(skb, &qos, sizeof(qos));
+ skb_put_data(skb, &qoshdr, sizeof(qoshdr));
}
memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index bec424316ea4..d0eb38b890aa 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -299,16 +299,16 @@ out:
spin_unlock_bh(&fq->lock);
}
-void ieee80211_wake_txqs(unsigned long data)
+static void
+__releases(&local->queue_stop_reason_lock)
+__acquires(&local->queue_stop_reason_lock)
+_ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags)
{
- struct ieee80211_local *local = (struct ieee80211_local *)data;
struct ieee80211_sub_if_data *sdata;
int n_acs = IEEE80211_NUM_ACS;
- unsigned long flags;
int i;
rcu_read_lock();
- spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
if (local->hw.queues < IEEE80211_NUM_ACS)
n_acs = 1;
@@ -317,7 +317,7 @@ void ieee80211_wake_txqs(unsigned long data)
if (local->queue_stop_reasons[i])
continue;
- spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags);
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
int ac;
@@ -329,13 +329,22 @@ void ieee80211_wake_txqs(unsigned long data)
__ieee80211_wake_txqs(sdata, ac);
}
}
- spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ spin_lock_irqsave(&local->queue_stop_reason_lock, *flags);
}
- spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
rcu_read_unlock();
}
+void ieee80211_wake_txqs(unsigned long data)
+{
+ struct ieee80211_local *local = (struct ieee80211_local *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ _ieee80211_wake_txqs(local, &flags);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
{
struct ieee80211_sub_if_data *sdata;
@@ -371,7 +380,8 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
enum queue_stop_reason reason,
- bool refcounted)
+ bool refcounted,
+ unsigned long *flags)
{
struct ieee80211_local *local = hw_to_local(hw);
@@ -405,8 +415,19 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
} else
tasklet_schedule(&local->tx_pending_tasklet);
- if (local->ops->wake_tx_queue)
- tasklet_schedule(&local->wake_txqs_tasklet);
+ /*
+ * Calling _ieee80211_wake_txqs here can be a problem because it may
+ * release queue_stop_reason_lock which has been taken by
+ * __ieee80211_wake_queue's caller. It is certainly not very nice to
+ * release someone's lock, but it is fine because all the callers of
+ * __ieee80211_wake_queue call it right before releasing the lock.
+ */
+ if (local->ops->wake_tx_queue) {
+ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER)
+ tasklet_schedule(&local->wake_txqs_tasklet);
+ else
+ _ieee80211_wake_txqs(local, flags);
+ }
}
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
@@ -417,7 +438,7 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
unsigned long flags;
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- __ieee80211_wake_queue(hw, queue, reason, refcounted);
+ __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
@@ -514,7 +535,7 @@ void ieee80211_add_pending_skb(struct ieee80211_local *local,
false);
__skb_queue_tail(&local->pending[queue], skb);
__ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
- false);
+ false, &flags);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
@@ -547,7 +568,7 @@ void ieee80211_add_pending_skbs(struct ieee80211_local *local,
for (i = 0; i < hw->queues; i++)
__ieee80211_wake_queue(hw, i,
IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
- false);
+ false, &flags);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
@@ -605,7 +626,7 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
for_each_set_bit(i, &queues, hw->queues)
- __ieee80211_wake_queue(hw, i, reason, refcounted);
+ __ieee80211_wake_queue(hw, i, reason, refcounted, &flags);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
@@ -1202,6 +1223,8 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA &&
elen >= (sizeof(*elems->mu_edca_param_set) + 1)) {
elems->mu_edca_param_set = (void *)&pos[1];
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
} else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) {
elems->he_cap = (void *)&pos[1];
elems->he_cap_len = elen - 1;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 73e8f347802e..bfe9ed9f4c48 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -30,13 +30,13 @@ int ieee80211_wep_init(struct ieee80211_local *local)
/* start WEP IV from a random value */
get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
- local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(local->wep_tx_tfm)) {
local->wep_rx_tfm = ERR_PTR(-EINVAL);
return PTR_ERR(local->wep_tx_tfm);
}
- local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(local->wep_rx_tfm)) {
crypto_free_cipher(local->wep_tx_tfm);
local->wep_tx_tfm = ERR_PTR(-EINVAL);
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 1dae77c54009..87505600dbb2 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -73,10 +73,15 @@ enum {
#define NCSI_OEM_MFR_BCM_ID 0x113d
/* Broadcom specific OEM Command */
#define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */
+/* Mellanox specific OEM Command */
+#define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */
+#define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */
/* OEM Command payload lengths*/
#define NCSI_OEM_BCM_CMD_GMA_LEN 12
+#define NCSI_OEM_MLX_CMD_GMA_LEN 8
/* Mac address offset in OEM response */
#define BCM_MAC_ADDR_OFFSET 28
+#define MLX_MAC_ADDR_OFFSET 8
struct ncsi_channel_version {
@@ -222,6 +227,10 @@ struct ncsi_package {
unsigned int channel_num; /* Number of channels */
struct list_head channels; /* List of chanels */
struct list_head node; /* Form list of packages */
+
+ bool multi_channel; /* Enable multiple channels */
+ u32 channel_whitelist; /* Channels to configure */
+ struct ncsi_channel *preferred_channel; /* Primary channel */
};
struct ncsi_request {
@@ -287,16 +296,16 @@ struct ncsi_dev_priv {
#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */
#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */
#define NCSI_DEV_RESHUFFLE 4
+#define NCSI_DEV_RESET 8 /* Reset state of NC */
unsigned int gma_flag; /* OEM GMA flag */
spinlock_t lock; /* Protect the NCSI device */
#if IS_ENABLED(CONFIG_IPV6)
unsigned int inet6_addr_num; /* Number of IPv6 addresses */
#endif
+ unsigned int package_probe_id;/* Current ID during probe */
unsigned int package_num; /* Number of packages */
struct list_head packages; /* List of packages */
struct ncsi_channel *hot_channel; /* Channel was ever active */
- struct ncsi_package *force_package; /* Force a specific package */
- struct ncsi_channel *force_channel; /* Force a specific channel */
struct ncsi_request requests[256]; /* Request table */
unsigned int request_id; /* Last used request ID */
#define NCSI_REQ_START_IDX 1
@@ -309,6 +318,9 @@ struct ncsi_dev_priv {
struct list_head node; /* Form NCSI device list */
#define NCSI_MAX_VLAN_VIDS 15
struct list_head vlan_vids; /* List of active VLAN IDs */
+
+ bool multi_package; /* Enable multiple packages */
+ u32 package_whitelist; /* Packages to configure */
};
struct ncsi_cmd_arg {
@@ -341,6 +353,7 @@ extern spinlock_t ncsi_dev_lock;
list_for_each_entry_rcu(nc, &np->channels, node)
/* Resources */
+int ncsi_reset_dev(struct ncsi_dev *nd);
void ncsi_start_channel_monitor(struct ncsi_channel *nc);
void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
@@ -361,6 +374,13 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
void ncsi_free_request(struct ncsi_request *nr);
struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
+bool ncsi_channel_has_link(struct ncsi_channel *channel);
+bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
+ struct ncsi_channel *channel);
+int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp,
+ struct ncsi_package *np,
+ struct ncsi_channel *disable,
+ struct ncsi_channel *enable);
/* Packet handlers */
u32 ncsi_calculate_checksum(unsigned char *data, int len);
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 25e483e8278b..26d67e27551f 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -50,13 +50,15 @@ static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h,
static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
struct ncsi_aen_pkt_hdr *h)
{
- struct ncsi_aen_lsc_pkt *lsc;
- struct ncsi_channel *nc;
+ struct ncsi_channel *nc, *tmp;
struct ncsi_channel_mode *ncm;
- bool chained;
- int state;
unsigned long old_data, data;
+ struct ncsi_aen_lsc_pkt *lsc;
+ struct ncsi_package *np;
+ bool had_link, has_link;
unsigned long flags;
+ bool chained;
+ int state;
/* Find the NCSI channel */
ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
@@ -73,6 +75,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
ncm->data[2] = data;
ncm->data[4] = ntohl(lsc->oem_status);
+ had_link = !!(old_data & 0x1);
+ has_link = !!(data & 0x1);
+
netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n",
nc->id, data & 0x1 ? "up" : "down");
@@ -80,22 +85,60 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
state = nc->state;
spin_unlock_irqrestore(&nc->lock, flags);
- if (!((old_data ^ data) & 0x1) || chained)
- return 0;
- if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
- !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
+ if (state == NCSI_CHANNEL_INACTIVE)
+ netdev_warn(ndp->ndev.dev,
+ "NCSI: Inactive channel %u received AEN!\n",
+ nc->id);
+
+ if ((had_link == has_link) || chained)
return 0;
- if (!(ndp->flags & NCSI_DEV_HWA) &&
- state == NCSI_CHANNEL_ACTIVE)
- ndp->flags |= NCSI_DEV_RESHUFFLE;
+ if (!ndp->multi_package && !nc->package->multi_channel) {
+ if (had_link) {
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
+ ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return ncsi_process_next_channel(ndp);
+ }
+ /* Configured channel came up */
+ return 0;
+ }
- ncsi_stop_channel_monitor(nc);
- spin_lock_irqsave(&ndp->lock, flags);
- list_add_tail_rcu(&nc->link, &ndp->channel_queue);
- spin_unlock_irqrestore(&ndp->lock, flags);
+ if (had_link) {
+ ncm = &nc->modes[NCSI_MODE_TX_ENABLE];
+ if (ncsi_channel_is_last(ndp, nc)) {
+ /* No channels left, reconfigure */
+ return ncsi_reset_dev(&ndp->ndev);
+ } else if (ncm->enable) {
+ /* Need to failover Tx channel */
+ ncsi_update_tx_channel(ndp, nc->package, nc, NULL);
+ }
+ } else if (has_link && nc->package->preferred_channel == nc) {
+ /* Return Tx to preferred channel */
+ ncsi_update_tx_channel(ndp, nc->package, NULL, nc);
+ } else if (has_link) {
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, tmp) {
+ /* Enable Tx on this channel if the current Tx
+ * channel is down.
+ */
+ ncm = &tmp->modes[NCSI_MODE_TX_ENABLE];
+ if (ncm->enable &&
+ !ncsi_channel_has_link(tmp)) {
+ ncsi_update_tx_channel(ndp, nc->package,
+ tmp, nc);
+ break;
+ }
+ }
+ }
+ }
- return ncsi_process_next_channel(ndp);
+ /* Leave configured channels active in a multi-channel scenario so
+ * AEN events are still received.
+ */
+ return 0;
}
static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index bfc43b28c7a6..31359d5e14ad 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -28,6 +28,29 @@
LIST_HEAD(ncsi_dev_list);
DEFINE_SPINLOCK(ncsi_dev_lock);
+bool ncsi_channel_has_link(struct ncsi_channel *channel)
+{
+ return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1);
+}
+
+bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
+ struct ncsi_channel *channel)
+{
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+
+ NCSI_FOR_EACH_PACKAGE(ndp, np)
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ if (nc == channel)
+ continue;
+ if (nc->state == NCSI_CHANNEL_ACTIVE &&
+ ncsi_channel_has_link(nc))
+ return false;
+ }
+
+ return true;
+}
+
static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
{
struct ncsi_dev *nd = &ndp->ndev;
@@ -52,7 +75,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
continue;
}
- if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+ if (ncsi_channel_has_link(nc)) {
spin_unlock_irqrestore(&nc->lock, flags);
nd->link_up = 1;
goto report;
@@ -113,10 +136,8 @@ static void ncsi_channel_monitor(struct timer_list *t)
default:
netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n",
nc->id);
- if (!(ndp->flags & NCSI_DEV_HWA)) {
- ncsi_report_link(ndp, true);
- ndp->flags |= NCSI_DEV_RESHUFFLE;
- }
+ ncsi_report_link(ndp, true);
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
ncsi_stop_channel_monitor(nc);
@@ -269,6 +290,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
np->ndp = ndp;
spin_lock_init(&np->lock);
INIT_LIST_HEAD(&np->channels);
+ np->channel_whitelist = UINT_MAX;
spin_lock_irqsave(&ndp->lock, flags);
tmp = ncsi_find_package(ndp, id);
@@ -442,12 +464,14 @@ static void ncsi_request_timeout(struct timer_list *t)
static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
{
struct ncsi_dev *nd = &ndp->ndev;
- struct ncsi_package *np = ndp->active_package;
- struct ncsi_channel *nc = ndp->active_channel;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc, *tmp;
struct ncsi_cmd_arg nca;
unsigned long flags;
int ret;
+ np = ndp->active_package;
+ nc = ndp->active_channel;
nca.ndp = ndp;
nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
@@ -523,6 +547,15 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
if (ret)
goto error;
+ NCSI_FOR_EACH_CHANNEL(np, tmp) {
+ /* If there is another channel active on this package
+ * do not deselect the package.
+ */
+ if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) {
+ nd->state = ncsi_dev_state_suspend_done;
+ break;
+ }
+ }
break;
case ncsi_dev_state_suspend_deselect:
ndp->pending_req_num = 1;
@@ -541,8 +574,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
spin_lock_irqsave(&nc->lock, flags);
nc->state = NCSI_CHANNEL_INACTIVE;
spin_unlock_irqrestore(&nc->lock, flags);
- ncsi_process_next_channel(ndp);
-
+ if (ndp->flags & NCSI_DEV_RESET)
+ ncsi_reset_dev(nd);
+ else
+ ncsi_process_next_channel(ndp);
break;
default:
netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
@@ -675,12 +710,38 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
return ret;
}
+static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca)
+{
+ union {
+ u8 data_u8[NCSI_OEM_MLX_CMD_GMA_LEN];
+ u32 data_u32[NCSI_OEM_MLX_CMD_GMA_LEN / sizeof(u32)];
+ } u;
+ int ret = 0;
+
+ nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN;
+
+ memset(&u, 0, sizeof(u));
+ u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+ u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA;
+ u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM;
+
+ nca->data = u.data_u8;
+
+ ret = ncsi_xmit_cmd(nca);
+ if (ret)
+ netdev_err(nca->ndp->ndev.dev,
+ "NCSI: Failed to transmit cmd 0x%x during configure\n",
+ nca->type);
+ return ret;
+}
+
/* OEM Command handlers initialization */
static struct ncsi_oem_gma_handler {
unsigned int mfr_id;
int (*handler)(struct ncsi_cmd_arg *nca);
} ncsi_oem_gma_handlers[] = {
- { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
+ { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm },
+ { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }
};
static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
@@ -717,13 +778,144 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+/* Determine if a given channel from the channel_queue should be used for Tx */
+static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp,
+ struct ncsi_channel *nc)
+{
+ struct ncsi_channel_mode *ncm;
+ struct ncsi_channel *channel;
+ struct ncsi_package *np;
+
+ /* Check if any other channel has Tx enabled; a channel may have already
+ * been configured and removed from the channel queue.
+ */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ if (!ndp->multi_package && np != nc->package)
+ continue;
+ NCSI_FOR_EACH_CHANNEL(np, channel) {
+ ncm = &channel->modes[NCSI_MODE_TX_ENABLE];
+ if (ncm->enable)
+ return false;
+ }
+ }
+
+ /* This channel is the preferred channel and has link */
+ list_for_each_entry_rcu(channel, &ndp->channel_queue, link) {
+ np = channel->package;
+ if (np->preferred_channel &&
+ ncsi_channel_has_link(np->preferred_channel)) {
+ return np->preferred_channel == nc;
+ }
+ }
+
+ /* This channel has link */
+ if (ncsi_channel_has_link(nc))
+ return true;
+
+ list_for_each_entry_rcu(channel, &ndp->channel_queue, link)
+ if (ncsi_channel_has_link(channel))
+ return false;
+
+ /* No other channel has link; default to this one */
+ return true;
+}
+
+/* Change the active Tx channel in a multi-channel setup */
+int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp,
+ struct ncsi_package *package,
+ struct ncsi_channel *disable,
+ struct ncsi_channel *enable)
+{
+ struct ncsi_cmd_arg nca;
+ struct ncsi_channel *nc;
+ struct ncsi_package *np;
+ int ret = 0;
+
+ if (!package->multi_channel && !ndp->multi_package)
+ netdev_warn(ndp->ndev.dev,
+ "NCSI: Trying to update Tx channel in single-channel mode\n");
+ nca.ndp = ndp;
+ nca.req_flags = 0;
+
+ /* Find current channel with Tx enabled */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ if (disable)
+ break;
+ if (!ndp->multi_package && np != package)
+ continue;
+
+ NCSI_FOR_EACH_CHANNEL(np, nc)
+ if (nc->modes[NCSI_MODE_TX_ENABLE].enable) {
+ disable = nc;
+ break;
+ }
+ }
+
+ /* Find a suitable channel for Tx */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ if (enable)
+ break;
+ if (!ndp->multi_package && np != package)
+ continue;
+ if (!(ndp->package_whitelist & (0x1 << np->id)))
+ continue;
+
+ if (np->preferred_channel &&
+ ncsi_channel_has_link(np->preferred_channel)) {
+ enable = np->preferred_channel;
+ break;
+ }
+
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ if (!(np->channel_whitelist & 0x1 << nc->id))
+ continue;
+ if (nc->state != NCSI_CHANNEL_ACTIVE)
+ continue;
+ if (ncsi_channel_has_link(nc)) {
+ enable = nc;
+ break;
+ }
+ }
+ }
+
+ if (disable == enable)
+ return -1;
+
+ if (!enable)
+ return -1;
+
+ if (disable) {
+ nca.channel = disable->id;
+ nca.package = disable->package->id;
+ nca.type = NCSI_PKT_CMD_DCNT;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ netdev_err(ndp->ndev.dev,
+ "Error %d sending DCNT\n",
+ ret);
+ }
+
+ netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id);
+
+ nca.channel = enable->id;
+ nca.package = enable->package->id;
+ nca.type = NCSI_PKT_CMD_ECNT;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ netdev_err(ndp->ndev.dev,
+ "Error %d sending ECNT\n",
+ ret);
+
+ return ret;
+}
+
static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
{
- struct ncsi_dev *nd = &ndp->ndev;
- struct net_device *dev = nd->dev;
struct ncsi_package *np = ndp->active_package;
struct ncsi_channel *nc = ndp->active_channel;
struct ncsi_channel *hot_nc = NULL;
+ struct ncsi_dev *nd = &ndp->ndev;
+ struct net_device *dev = nd->dev;
struct ncsi_cmd_arg nca;
unsigned char index;
unsigned long flags;
@@ -845,20 +1037,29 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
} else if (nd->state == ncsi_dev_state_config_ebf) {
nca.type = NCSI_PKT_CMD_EBF;
nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
- nd->state = ncsi_dev_state_config_ecnt;
+ if (ncsi_channel_is_tx(ndp, nc))
+ nd->state = ncsi_dev_state_config_ecnt;
+ else
+ nd->state = ncsi_dev_state_config_ec;
#if IS_ENABLED(CONFIG_IPV6)
if (ndp->inet6_addr_num > 0 &&
(nc->caps[NCSI_CAP_GENERIC].cap &
NCSI_CAP_GENERIC_MC))
nd->state = ncsi_dev_state_config_egmf;
- else
- nd->state = ncsi_dev_state_config_ecnt;
} else if (nd->state == ncsi_dev_state_config_egmf) {
nca.type = NCSI_PKT_CMD_EGMF;
nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
- nd->state = ncsi_dev_state_config_ecnt;
+ if (ncsi_channel_is_tx(ndp, nc))
+ nd->state = ncsi_dev_state_config_ecnt;
+ else
+ nd->state = ncsi_dev_state_config_ec;
#endif /* CONFIG_IPV6 */
} else if (nd->state == ncsi_dev_state_config_ecnt) {
+ if (np->preferred_channel &&
+ nc != np->preferred_channel)
+ netdev_info(ndp->ndev.dev,
+ "NCSI: Tx failed over to channel %u\n",
+ nc->id);
nca.type = NCSI_PKT_CMD_ECNT;
nd->state = ncsi_dev_state_config_ec;
} else if (nd->state == ncsi_dev_state_config_ec) {
@@ -889,6 +1090,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n",
nc->id);
spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_ACTIVE;
+
+ if (ndp->flags & NCSI_DEV_RESET) {
+ /* A reset event happened during config, start it now */
+ nc->reconfigure_needed = false;
+ spin_unlock_irqrestore(&nc->lock, flags);
+ ncsi_reset_dev(nd);
+ break;
+ }
+
if (nc->reconfigure_needed) {
/* This channel's configuration has been updated
* part-way during the config state - start the
@@ -909,10 +1120,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
hot_nc = nc;
- nc->state = NCSI_CHANNEL_ACTIVE;
} else {
hot_nc = NULL;
- nc->state = NCSI_CHANNEL_INACTIVE;
netdev_dbg(ndp->ndev.dev,
"NCSI: channel %u link down after config\n",
nc->id);
@@ -940,43 +1149,35 @@ error:
static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
{
- struct ncsi_package *np, *force_package;
- struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
+ struct ncsi_channel *nc, *found, *hot_nc;
struct ncsi_channel_mode *ncm;
- unsigned long flags;
+ unsigned long flags, cflags;
+ struct ncsi_package *np;
+ bool with_link;
spin_lock_irqsave(&ndp->lock, flags);
hot_nc = ndp->hot_channel;
- force_channel = ndp->force_channel;
- force_package = ndp->force_package;
spin_unlock_irqrestore(&ndp->lock, flags);
- /* Force a specific channel whether or not it has link if we have been
- * configured to do so
- */
- if (force_package && force_channel) {
- found = force_channel;
- ncm = &found->modes[NCSI_MODE_LINK];
- if (!(ncm->data[2] & 0x1))
- netdev_info(ndp->ndev.dev,
- "NCSI: Channel %u forced, but it is link down\n",
- found->id);
- goto out;
- }
-
- /* The search is done once an inactive channel with up
- * link is found.
+ /* By default the search is done once an inactive channel with up
+ * link is found, unless a preferred channel is set.
+ * If multi_package or multi_channel are configured all channels in the
+ * whitelist are added to the channel queue.
*/
found = NULL;
+ with_link = false;
NCSI_FOR_EACH_PACKAGE(ndp, np) {
- if (ndp->force_package && np != ndp->force_package)
+ if (!(ndp->package_whitelist & (0x1 << np->id)))
continue;
NCSI_FOR_EACH_CHANNEL(np, nc) {
- spin_lock_irqsave(&nc->lock, flags);
+ if (!(np->channel_whitelist & (0x1 << nc->id)))
+ continue;
+
+ spin_lock_irqsave(&nc->lock, cflags);
if (!list_empty(&nc->link) ||
nc->state != NCSI_CHANNEL_INACTIVE) {
- spin_unlock_irqrestore(&nc->lock, flags);
+ spin_unlock_irqrestore(&nc->lock, cflags);
continue;
}
@@ -988,32 +1189,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
ncm = &nc->modes[NCSI_MODE_LINK];
if (ncm->data[2] & 0x1) {
- spin_unlock_irqrestore(&nc->lock, flags);
found = nc;
- goto out;
+ with_link = true;
}
- spin_unlock_irqrestore(&nc->lock, flags);
+ /* If multi_channel is enabled configure all valid
+ * channels whether or not they currently have link
+ * so they will have AENs enabled.
+ */
+ if (with_link || np->multi_channel) {
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&nc->link,
+ &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ netdev_dbg(ndp->ndev.dev,
+ "NCSI: Channel %u added to queue (link %s)\n",
+ nc->id,
+ ncm->data[2] & 0x1 ? "up" : "down");
+ }
+
+ spin_unlock_irqrestore(&nc->lock, cflags);
+
+ if (with_link && !np->multi_channel)
+ break;
}
+ if (with_link && !ndp->multi_package)
+ break;
}
- if (!found) {
+ if (list_empty(&ndp->channel_queue) && found) {
+ netdev_info(ndp->ndev.dev,
+ "NCSI: No channel with link found, configuring channel %u\n",
+ found->id);
+ spin_lock_irqsave(&ndp->lock, flags);
+ list_add_tail_rcu(&found->link, &ndp->channel_queue);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ } else if (!found) {
netdev_warn(ndp->ndev.dev,
- "NCSI: No channel found with link\n");
+ "NCSI: No channel found to configure!\n");
ncsi_report_link(ndp, true);
return -ENODEV;
}
- ncm = &found->modes[NCSI_MODE_LINK];
- netdev_dbg(ndp->ndev.dev,
- "NCSI: Channel %u added to queue (link %s)\n",
- found->id, ncm->data[2] & 0x1 ? "up" : "down");
-
-out:
- spin_lock_irqsave(&ndp->lock, flags);
- list_add_tail_rcu(&found->link, &ndp->channel_queue);
- spin_unlock_irqrestore(&ndp->lock, flags);
-
return ncsi_process_next_channel(ndp);
}
@@ -1050,35 +1268,6 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
return false;
}
-static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp)
-{
- struct ncsi_package *np;
- struct ncsi_channel *nc;
- unsigned long flags;
-
- /* Move all available channels to processing queue */
- spin_lock_irqsave(&ndp->lock, flags);
- NCSI_FOR_EACH_PACKAGE(ndp, np) {
- NCSI_FOR_EACH_CHANNEL(np, nc) {
- WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE ||
- !list_empty(&nc->link));
- ncsi_stop_channel_monitor(nc);
- list_add_tail_rcu(&nc->link, &ndp->channel_queue);
- }
- }
- spin_unlock_irqrestore(&ndp->lock, flags);
-
- /* We can have no channels in extremely case */
- if (list_empty(&ndp->channel_queue)) {
- netdev_err(ndp->ndev.dev,
- "NCSI: No available channels for HWA\n");
- ncsi_report_link(ndp, false);
- return -ENOENT;
- }
-
- return ncsi_process_next_channel(ndp);
-}
-
static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
{
struct ncsi_dev *nd = &ndp->ndev;
@@ -1110,70 +1299,28 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nd->state = ncsi_dev_state_probe_package;
break;
case ncsi_dev_state_probe_package:
- ndp->pending_req_num = 16;
+ ndp->pending_req_num = 1;
- /* Select all possible packages */
nca.type = NCSI_PKT_CMD_SP;
nca.bytes[0] = 1;
+ nca.package = ndp->package_probe_id;
nca.channel = NCSI_RESERVED_CHANNEL;
- for (index = 0; index < 8; index++) {
- nca.package = index;
- ret = ncsi_xmit_cmd(&nca);
- if (ret)
- goto error;
- }
-
- /* Disable all possible packages */
- nca.type = NCSI_PKT_CMD_DP;
- for (index = 0; index < 8; index++) {
- nca.package = index;
- ret = ncsi_xmit_cmd(&nca);
- if (ret)
- goto error;
- }
-
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
nd->state = ncsi_dev_state_probe_channel;
break;
case ncsi_dev_state_probe_channel:
- if (!ndp->active_package)
- ndp->active_package = list_first_or_null_rcu(
- &ndp->packages, struct ncsi_package, node);
- else if (list_is_last(&ndp->active_package->node,
- &ndp->packages))
- ndp->active_package = NULL;
- else
- ndp->active_package = list_next_entry(
- ndp->active_package, node);
-
- /* All available packages and channels are enumerated. The
- * enumeration happens for once when the NCSI interface is
- * started. So we need continue to start the interface after
- * the enumeration.
- *
- * We have to choose an active channel before configuring it.
- * Note that we possibly don't have active channel in extreme
- * situation.
- */
+ ndp->active_package = ncsi_find_package(ndp,
+ ndp->package_probe_id);
if (!ndp->active_package) {
- ndp->flags |= NCSI_DEV_PROBED;
- if (ncsi_check_hwa(ndp))
- ncsi_enable_hwa(ndp);
- else
- ncsi_choose_active_channel(ndp);
- return;
+ /* No response */
+ nd->state = ncsi_dev_state_probe_dp;
+ schedule_work(&ndp->work);
+ break;
}
-
- /* Select the active package */
- ndp->pending_req_num = 1;
- nca.type = NCSI_PKT_CMD_SP;
- nca.bytes[0] = 1;
- nca.package = ndp->active_package->id;
- nca.channel = NCSI_RESERVED_CHANNEL;
- ret = ncsi_xmit_cmd(&nca);
- if (ret)
- goto error;
-
nd->state = ncsi_dev_state_probe_cis;
+ schedule_work(&ndp->work);
break;
case ncsi_dev_state_probe_cis:
ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
@@ -1222,22 +1369,35 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
case ncsi_dev_state_probe_dp:
ndp->pending_req_num = 1;
- /* Deselect the active package */
+ /* Deselect the current package */
nca.type = NCSI_PKT_CMD_DP;
- nca.package = ndp->active_package->id;
+ nca.package = ndp->package_probe_id;
nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
- /* Scan channels in next package */
- nd->state = ncsi_dev_state_probe_channel;
+ /* Probe next package */
+ ndp->package_probe_id++;
+ if (ndp->package_probe_id >= 8) {
+ /* Probe finished */
+ ndp->flags |= NCSI_DEV_PROBED;
+ break;
+ }
+ nd->state = ncsi_dev_state_probe_package;
+ ndp->active_package = NULL;
break;
default:
netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n",
nd->state);
}
+ if (ndp->flags & NCSI_DEV_PROBED) {
+ /* Check if all packages have HWA support */
+ ncsi_check_hwa(ndp);
+ ncsi_choose_active_channel(ndp);
+ }
+
return;
error:
netdev_err(ndp->ndev.dev,
@@ -1556,6 +1716,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
INIT_LIST_HEAD(&ndp->channel_queue);
INIT_LIST_HEAD(&ndp->vlan_vids);
INIT_WORK(&ndp->work, ncsi_dev_work);
+ ndp->package_whitelist = UINT_MAX;
/* Initialize private NCSI device */
spin_lock_init(&ndp->lock);
@@ -1592,26 +1753,19 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev);
int ncsi_start_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
- int ret;
if (nd->state != ncsi_dev_state_registered &&
nd->state != ncsi_dev_state_functional)
return -ENOTTY;
if (!(ndp->flags & NCSI_DEV_PROBED)) {
+ ndp->package_probe_id = 0;
nd->state = ncsi_dev_state_probe;
schedule_work(&ndp->work);
return 0;
}
- if (ndp->flags & NCSI_DEV_HWA) {
- netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n");
- ret = ncsi_enable_hwa(ndp);
- } else {
- ret = ncsi_choose_active_channel(ndp);
- }
-
- return ret;
+ return ncsi_reset_dev(nd);
}
EXPORT_SYMBOL_GPL(ncsi_start_dev);
@@ -1624,7 +1778,10 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
int old_state;
unsigned long flags;
- /* Stop the channel monitor and reset channel's state */
+ /* Stop the channel monitor on any active channels. Don't reset the
+ * channel state so we know which were active when ncsi_start_dev()
+ * is next called.
+ */
NCSI_FOR_EACH_PACKAGE(ndp, np) {
NCSI_FOR_EACH_CHANNEL(np, nc) {
ncsi_stop_channel_monitor(nc);
@@ -1632,7 +1789,6 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
spin_lock_irqsave(&nc->lock, flags);
chained = !list_empty(&nc->link);
old_state = nc->state;
- nc->state = NCSI_CHANNEL_INACTIVE;
spin_unlock_irqrestore(&nc->lock, flags);
WARN_ON_ONCE(chained ||
@@ -1645,6 +1801,92 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
}
EXPORT_SYMBOL_GPL(ncsi_stop_dev);
+int ncsi_reset_dev(struct ncsi_dev *nd)
+{
+ struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+ struct ncsi_channel *nc, *active, *tmp;
+ struct ncsi_package *np;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ndp->lock, flags);
+
+ if (!(ndp->flags & NCSI_DEV_RESET)) {
+ /* Haven't been called yet, check states */
+ switch (nd->state & ncsi_dev_state_major) {
+ case ncsi_dev_state_registered:
+ case ncsi_dev_state_probe:
+ /* Not even probed yet - do nothing */
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return 0;
+ case ncsi_dev_state_suspend:
+ case ncsi_dev_state_config:
+ /* Wait for the channel to finish its suspend/config
+ * operation; once it finishes it will check for
+ * NCSI_DEV_RESET and reset the state.
+ */
+ ndp->flags |= NCSI_DEV_RESET;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return 0;
+ }
+ } else {
+ switch (nd->state) {
+ case ncsi_dev_state_suspend_done:
+ case ncsi_dev_state_config_done:
+ case ncsi_dev_state_functional:
+ /* Ok */
+ break;
+ default:
+ /* Current reset operation happening */
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return 0;
+ }
+ }
+
+ if (!list_empty(&ndp->channel_queue)) {
+ /* Clear any channel queue we may have interrupted */
+ list_for_each_entry_safe(nc, tmp, &ndp->channel_queue, link)
+ list_del_init(&nc->link);
+ }
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ active = NULL;
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+
+ if (nc->state == NCSI_CHANNEL_ACTIVE) {
+ active = nc;
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+ ncsi_stop_channel_monitor(nc);
+ break;
+ }
+
+ spin_unlock_irqrestore(&nc->lock, flags);
+ }
+ if (active)
+ break;
+ }
+
+ if (!active) {
+ /* Done */
+ spin_lock_irqsave(&ndp->lock, flags);
+ ndp->flags &= ~NCSI_DEV_RESET;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+ return ncsi_choose_active_channel(ndp);
+ }
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ ndp->flags |= NCSI_DEV_RESET;
+ ndp->active_channel = active;
+ ndp->active_package = active->package;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ nd->state = ncsi_dev_state_suspend;
+ schedule_work(&ndp->work);
+ return 0;
+}
+
void ncsi_unregister_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 33314381b4f5..5d782445d2fc 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -30,6 +30,9 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
[NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 },
[NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 },
[NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 },
+ [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG },
+ [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 },
+ [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 },
};
static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -69,7 +72,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
if (nc->state == NCSI_CHANNEL_ACTIVE)
nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
- if (ndp->force_channel == nc)
+ if (nc == nc->package->preferred_channel)
nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
@@ -114,7 +117,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
if (!pnest)
return -ENOMEM;
nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
- if (ndp->force_package == np)
+ if ((0x1 << np->id) == ndp->package_whitelist)
nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
if (!cnest) {
@@ -290,49 +293,58 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
package = NULL;
- spin_lock_irqsave(&ndp->lock, flags);
-
NCSI_FOR_EACH_PACKAGE(ndp, np)
if (np->id == package_id)
package = np;
if (!package) {
/* The user has set a package that does not exist */
- spin_unlock_irqrestore(&ndp->lock, flags);
return -ERANGE;
}
channel = NULL;
- if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
- /* Allow any channel */
- channel_id = NCSI_RESERVED_CHANNEL;
- } else {
+ if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
NCSI_FOR_EACH_CHANNEL(package, nc)
- if (nc->id == channel_id)
+ if (nc->id == channel_id) {
channel = nc;
+ break;
+ }
+ if (!channel) {
+ netdev_info(ndp->ndev.dev,
+ "NCSI: Channel %u does not exist!\n",
+ channel_id);
+ return -ERANGE;
+ }
}
- if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
- /* The user has set a channel that does not exist on this
- * package
- */
- spin_unlock_irqrestore(&ndp->lock, flags);
- netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
- channel_id);
- return -ERANGE;
- }
-
- ndp->force_package = package;
- ndp->force_channel = channel;
+ spin_lock_irqsave(&ndp->lock, flags);
+ ndp->package_whitelist = 0x1 << package->id;
+ ndp->multi_package = false;
spin_unlock_irqrestore(&ndp->lock, flags);
- netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
- package_id, channel_id,
- channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
+ spin_lock_irqsave(&package->lock, flags);
+ package->multi_channel = false;
+ if (channel) {
+ package->channel_whitelist = 0x1 << channel->id;
+ package->preferred_channel = channel;
+ } else {
+ /* Allow any channel */
+ package->channel_whitelist = UINT_MAX;
+ package->preferred_channel = NULL;
+ }
+ spin_unlock_irqrestore(&package->lock, flags);
+
+ if (channel)
+ netdev_info(ndp->ndev.dev,
+ "Set package 0x%x, channel 0x%x as preferred\n",
+ package_id, channel_id);
+ else
+ netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n",
+ package_id);
- /* Bounce the NCSI channel to set changes */
- ncsi_stop_dev(&ndp->ndev);
- ncsi_start_dev(&ndp->ndev);
+ /* Update channel configuration */
+ if (!(ndp->flags & NCSI_DEV_RESET))
+ ncsi_reset_dev(&ndp->ndev);
return 0;
}
@@ -340,6 +352,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
{
struct ncsi_dev_priv *ndp;
+ struct ncsi_package *np;
unsigned long flags;
if (!info || !info->attrs)
@@ -353,16 +366,24 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
if (!ndp)
return -ENODEV;
- /* Clear any override */
+ /* Reset any whitelists and disable multi mode */
spin_lock_irqsave(&ndp->lock, flags);
- ndp->force_package = NULL;
- ndp->force_channel = NULL;
+ ndp->package_whitelist = UINT_MAX;
+ ndp->multi_package = false;
spin_unlock_irqrestore(&ndp->lock, flags);
+
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ spin_lock_irqsave(&np->lock, flags);
+ np->multi_channel = false;
+ np->channel_whitelist = UINT_MAX;
+ np->preferred_channel = NULL;
+ spin_unlock_irqrestore(&np->lock, flags);
+ }
netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
- /* Bounce the NCSI channel to set changes */
- ncsi_stop_dev(&ndp->ndev);
- ncsi_start_dev(&ndp->ndev);
+ /* Update channel configuration */
+ if (!(ndp->flags & NCSI_DEV_RESET))
+ ncsi_reset_dev(&ndp->ndev);
return 0;
}
@@ -563,6 +584,138 @@ int ncsi_send_netlink_err(struct net_device *dev,
return nlmsg_unicast(net->genl_sock, skb, snd_portid);
}
+static int ncsi_set_package_mask_nl(struct sk_buff *msg,
+ struct genl_info *info)
+{
+ struct ncsi_dev_priv *ndp;
+ unsigned long flags;
+ int rc;
+
+ if (!info || !info->attrs)
+ return -EINVAL;
+
+ if (!info->attrs[NCSI_ATTR_IFINDEX])
+ return -EINVAL;
+
+ if (!info->attrs[NCSI_ATTR_PACKAGE_MASK])
+ return -EINVAL;
+
+ ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+ nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+ if (!ndp)
+ return -ENODEV;
+
+ spin_lock_irqsave(&ndp->lock, flags);
+ if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
+ if (ndp->flags & NCSI_DEV_HWA) {
+ ndp->multi_package = true;
+ rc = 0;
+ } else {
+ netdev_err(ndp->ndev.dev,
+ "NCSI: Can't use multiple packages without HWA\n");
+ rc = -EPERM;
+ }
+ } else {
+ ndp->multi_package = false;
+ rc = 0;
+ }
+
+ if (!rc)
+ ndp->package_whitelist =
+ nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]);
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
+ if (!rc) {
+ /* Update channel configuration */
+ if (!(ndp->flags & NCSI_DEV_RESET))
+ ncsi_reset_dev(&ndp->ndev);
+ }
+
+ return rc;
+}
+
+static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
+ struct genl_info *info)
+{
+ struct ncsi_package *np, *package;
+ struct ncsi_channel *nc, *channel;
+ u32 package_id, channel_id;
+ struct ncsi_dev_priv *ndp;
+ unsigned long flags;
+
+ if (!info || !info->attrs)
+ return -EINVAL;
+
+ if (!info->attrs[NCSI_ATTR_IFINDEX])
+ return -EINVAL;
+
+ if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
+ return -EINVAL;
+
+ if (!info->attrs[NCSI_ATTR_CHANNEL_MASK])
+ return -EINVAL;
+
+ ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+ nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+ if (!ndp)
+ return -ENODEV;
+
+ package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+ package = NULL;
+ NCSI_FOR_EACH_PACKAGE(ndp, np)
+ if (np->id == package_id) {
+ package = np;
+ break;
+ }
+ if (!package)
+ return -ERANGE;
+
+ spin_lock_irqsave(&package->lock, flags);
+
+ channel = NULL;
+ if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+ channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+ NCSI_FOR_EACH_CHANNEL(np, nc)
+ if (nc->id == channel_id) {
+ channel = nc;
+ break;
+ }
+ if (!channel) {
+ spin_unlock_irqrestore(&package->lock, flags);
+ return -ERANGE;
+ }
+ netdev_dbg(ndp->ndev.dev,
+ "NCSI: Channel %u set as preferred channel\n",
+ channel->id);
+ }
+
+ package->channel_whitelist =
+ nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]);
+ if (package->channel_whitelist == 0)
+ netdev_dbg(ndp->ndev.dev,
+ "NCSI: Package %u set to all channels disabled\n",
+ package->id);
+
+ package->preferred_channel = channel;
+
+ if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
+ package->multi_channel = true;
+ netdev_info(ndp->ndev.dev,
+ "NCSI: Multi-channel enabled on package %u\n",
+ package_id);
+ } else {
+ package->multi_channel = false;
+ }
+
+ spin_unlock_irqrestore(&package->lock, flags);
+
+ /* Update channel configuration */
+ if (!(ndp->flags & NCSI_DEV_RESET))
+ ncsi_reset_dev(&ndp->ndev);
+
+ return 0;
+}
+
static const struct genl_ops ncsi_ops[] = {
{
.cmd = NCSI_CMD_PKG_INFO,
@@ -589,6 +742,18 @@ static const struct genl_ops ncsi_ops[] = {
.doit = ncsi_send_cmd_nl,
.flags = GENL_ADMIN_PERM,
},
+ {
+ .cmd = NCSI_CMD_SET_PACKAGE_MASK,
+ .policy = ncsi_genl_policy,
+ .doit = ncsi_set_package_mask_nl,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NCSI_CMD_SET_CHANNEL_MASK,
+ .policy = ncsi_genl_policy,
+ .doit = ncsi_set_channel_mask_nl,
+ .flags = GENL_ADMIN_PERM,
+ },
};
static struct genl_family ncsi_genl_family __ro_after_init = {
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 4d3f06be38bd..2a6d83a596c9 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -165,6 +165,15 @@ struct ncsi_rsp_oem_pkt {
unsigned char data[]; /* Payload data */
};
+/* Mellanox Response Data */
+struct ncsi_rsp_oem_mlx_pkt {
+ unsigned char cmd_rev; /* Command Revision */
+ unsigned char cmd; /* Command ID */
+ unsigned char param; /* Parameter */
+ unsigned char optional; /* Optional data */
+ unsigned char data[]; /* Data */
+};
+
/* Broadcom Response Data */
struct ncsi_rsp_oem_bcm_pkt {
unsigned char ver; /* Payload Version */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 77e07ba3f493..dc07fcc7938e 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -256,7 +256,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
if (!ncm->enable)
return 0;
- ncm->enable = 1;
+ ncm->enable = 0;
return 0;
}
@@ -611,6 +611,45 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
return 0;
}
+/* Response handler for Mellanox command Get Mac Address */
+static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr)
+{
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct net_device *ndev = ndp->ndev.dev;
+ const struct net_device_ops *ops = ndev->netdev_ops;
+ struct ncsi_rsp_oem_pkt *rsp;
+ struct sockaddr saddr;
+ int ret = 0;
+
+ /* Get the response header */
+ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+
+ saddr.sa_family = ndev->type;
+ ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN);
+ ret = ops->ndo_set_mac_address(ndev, &saddr);
+ if (ret < 0)
+ netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
+
+ return ret;
+}
+
+/* Response handler for Mellanox card */
+static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_oem_mlx_pkt *mlx;
+ struct ncsi_rsp_oem_pkt *rsp;
+
+ /* Get the response header */
+ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+ mlx = (struct ncsi_rsp_oem_mlx_pkt *)(rsp->data);
+
+ if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA &&
+ mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM)
+ return ncsi_rsp_handler_oem_mlx_gma(nr);
+ return 0;
+}
+
/* Response handler for Broadcom command Get Mac Address */
static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
{
@@ -655,7 +694,7 @@ static struct ncsi_rsp_oem_handler {
unsigned int mfr_id;
int (*handler)(struct ncsi_request *nr);
} ncsi_rsp_oem_handlers[] = {
- { NCSI_OEM_MFR_MLX_ID, NULL },
+ { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx },
{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
};
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 2ab870ef233a..beb3a69ce1d4 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -403,21 +403,6 @@ config NF_NAT_NEEDED
depends on NF_NAT
default y
-config NF_NAT_PROTO_DCCP
- bool
- depends on NF_NAT && NF_CT_PROTO_DCCP
- default NF_NAT && NF_CT_PROTO_DCCP
-
-config NF_NAT_PROTO_UDPLITE
- bool
- depends on NF_NAT && NF_CT_PROTO_UDPLITE
- default NF_NAT && NF_CT_PROTO_UDPLITE
-
-config NF_NAT_PROTO_SCTP
- bool
- default NF_NAT && NF_CT_PROTO_SCTP
- depends on NF_NAT && NF_CT_PROTO_SCTP
-
config NF_NAT_AMANDA
tristate
depends on NF_CONNTRACK && NF_NAT
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4ddf3ef51ece..1ae65a314d7a 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -47,12 +47,7 @@ obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
-nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
- nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
-
-# NAT protocols (nf_nat)
-nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
-nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o
# generic transport layer logging
obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index c00b6a2e8e3c..980000fc3b50 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -219,10 +219,6 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
u32 ip;
- /* MAC can be src only */
- if (!(opt->flags & IPSET_DIM_TWO_SRC))
- return 0;
-
ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
if (ip < map->first_ip || ip > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
@@ -233,7 +229,14 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
return -EINVAL;
e.id = ip_to_id(map, ip);
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+
+ if (opt->flags & IPSET_DIM_ONE_SRC)
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
+ else
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
+
+ if (is_zero_ether_addr(e.ether))
+ return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 1577f2f76060..45a257695bef 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -771,11 +771,21 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
* The commands are serialized by the nfnl mutex.
*/
+static inline u8 protocol(const struct nlattr * const tb[])
+{
+ return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]);
+}
+
static inline bool
protocol_failed(const struct nlattr * const tb[])
{
- return !tb[IPSET_ATTR_PROTOCOL] ||
- nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+ return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL;
+}
+
+static inline bool
+protocol_min_failed(const struct nlattr * const tb[])
+{
+ return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN;
}
static inline u32
@@ -889,7 +899,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
u32 flags = flag_exist(nlh);
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!attr[IPSET_ATTR_TYPENAME] ||
!attr[IPSET_ATTR_REVISION] ||
@@ -1027,7 +1037,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
ip_set_id_t i;
int ret = 0;
- if (unlikely(protocol_failed(attr)))
+ if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
/* Must wait for flush to be really finished in list:set */
@@ -1105,7 +1115,7 @@ static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
struct ip_set *s;
ip_set_id_t i;
- if (unlikely(protocol_failed(attr)))
+ if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
if (!attr[IPSET_ATTR_SETNAME]) {
@@ -1147,7 +1157,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
ip_set_id_t i;
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
@@ -1196,7 +1206,7 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
ip_set_id_t from_id, to_id;
char from_name[IPSET_MAXNAMELEN];
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
@@ -1291,6 +1301,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len,
ip_set_setname_policy, NULL);
+ cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
if (cda[IPSET_ATTR_SETNAME]) {
struct ip_set *set;
@@ -1392,7 +1403,8 @@ dump_last:
ret = -EMSGSIZE;
goto release_refcount;
}
- if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL,
+ cb->args[IPSET_CB_PROTO]) ||
nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
goto nla_put_failure;
if (dump_flags & IPSET_FLAG_LIST_SETNAME)
@@ -1407,6 +1419,9 @@ dump_last:
nla_put_u8(skb, IPSET_ATTR_REVISION,
set->revision))
goto nla_put_failure;
+ if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN &&
+ nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index)))
+ goto nla_put_failure;
ret = set->variant->head(set, skb);
if (ret < 0)
goto release_refcount;
@@ -1466,7 +1481,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
const struct nlattr * const attr[],
struct netlink_ext_ack *extack)
{
- if (unlikely(protocol_failed(attr)))
+ if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
{
@@ -1560,7 +1575,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
bool use_lineno;
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
@@ -1615,7 +1630,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
bool use_lineno;
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
@@ -1667,7 +1682,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
!attr[IPSET_ATTR_DATA] ||
!flag_nested(attr[IPSET_ATTR_DATA])))
@@ -1704,7 +1719,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
struct nlmsghdr *nlh2;
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
return -IPSET_ERR_PROTOCOL;
@@ -1720,7 +1735,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
IPSET_CMD_HEADER);
if (!nlh2)
goto nlmsg_failure;
- if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
@@ -1761,7 +1776,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
const char *typename;
int ret = 0;
- if (unlikely(protocol_failed(attr) ||
+ if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_TYPENAME] ||
!attr[IPSET_ATTR_FAMILY]))
return -IPSET_ERR_PROTOCOL;
@@ -1780,7 +1795,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
IPSET_CMD_TYPE);
if (!nlh2)
goto nlmsg_failure;
- if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
@@ -1831,6 +1846,111 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
goto nlmsg_failure;
if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
goto nla_put_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get set by name or index, from userspace */
+
+static int ip_set_byname(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ ip_set_id_t id = IPSET_INVALID_ID;
+ const struct ip_set *set;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ !attr[IPSET_ATTR_SETNAME]))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id);
+ if (id == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_GET_BYNAME);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
+ nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
+ nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id)))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_INDEX] = { .type = NLA_U16 },
+};
+
+static int ip_set_byindex(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ ip_set_id_t id = IPSET_INVALID_ID;
+ const struct ip_set *set;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ !attr[IPSET_ATTR_INDEX]))
+ return -IPSET_ERR_PROTOCOL;
+
+ id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]);
+ if (id >= inst->ip_set_max)
+ return -ENOENT;
+ set = ip_set(inst, id);
+ if (set == NULL)
+ return -ENOENT;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_GET_BYINDEX);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
+ nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name))
+ goto nla_put_failure;
nlmsg_end(skb2, nlh2);
ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
@@ -1916,6 +2036,16 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_protocol_policy,
},
+ [IPSET_CMD_GET_BYNAME] = {
+ .call = ip_set_byname,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_GET_BYINDEX] = {
+ .call = ip_set_byindex,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_index_policy,
+ },
};
static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
@@ -1961,7 +2091,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
goto done;
}
- if (req_version->version != IPSET_PROTOCOL) {
+ if (req_version->version < IPSET_PROTOCOL_MIN) {
ret = -EPROTO;
goto done;
}
@@ -2024,9 +2154,11 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
}
nfnl_lock(NFNL_SUBSYS_IPSET);
set = ip_set(inst, req_get->set.index);
- strncpy(req_get->set.name, set ? set->name : "",
- IPSET_MAXNAMELEN);
+ ret = strscpy(req_get->set.name, set ? set->name : "",
+ IPSET_MAXNAMELEN);
nfnl_unlock(NFNL_SUBSYS_IPSET);
+ if (ret < 0)
+ goto done;
goto copy;
}
default:
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index e287da68d5fa..2c9609929c71 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -67,7 +67,7 @@ tune_ahash_max(u8 curr, u32 multi)
/* A hash bucket */
struct hbucket {
- struct rcu_head rcu; /* for call_rcu_bh */
+ struct rcu_head rcu; /* for call_rcu */
/* Which positions are used in the array */
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
@@ -664,7 +664,7 @@ retry:
spin_unlock_bh(&set->lock);
/* Give time to other readers of the set */
- synchronize_rcu_bh();
+ synchronize_rcu();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index 1ab5ed2f6839..c830c68142ff 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -36,9 +36,6 @@ MODULE_ALIAS("ip_set_hash:ip,mac");
/* Type specific function prefix */
#define HTYPE hash_ipmac
-/* Zero valued element is not supported */
-static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
-
/* IPv4 variant */
/* Member elements */
@@ -103,8 +100,12 @@ hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb,
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
- if (ether_addr_equal(e.ether, invalid_ether))
+ if (opt->flags & IPSET_DIM_ONE_SRC)
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
+ else
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
+
+ if (is_zero_ether_addr(e.ether))
return -EINVAL;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
@@ -140,7 +141,7 @@ hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
- if (ether_addr_equal(e.ether, invalid_ether))
+ if (is_zero_ether_addr(e.ether))
return -IPSET_ERR_HASH_ELEM;
return adtfn(set, &e, &ext, &ext, flags);
@@ -211,16 +212,16 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- /* MAC can be src only */
- if (!(opt->flags & IPSET_DIM_TWO_SRC))
- return 0;
-
if (skb_mac_header(skb) < skb->head ||
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
- if (ether_addr_equal(e.ether, invalid_ether))
+ if (opt->flags & IPSET_DIM_ONE_SRC)
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
+ else
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
+
+ if (is_zero_ether_addr(e.ether))
return -EINVAL;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
@@ -260,7 +261,7 @@ hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
- if (ether_addr_equal(e.ether, invalid_ether))
+ if (is_zero_ether_addr(e.ether))
return -IPSET_ERR_HASH_ELEM;
return adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index f9d5a2a1e3d0..4fe5f243d0a3 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -81,15 +81,15 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- /* MAC can be src only */
- if (!(opt->flags & IPSET_DIM_ONE_SRC))
- return 0;
-
if (skb_mac_header(skb) < skb->head ||
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
+ if (opt->flags & IPSET_DIM_ONE_SRC)
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
+ else
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
+
if (is_zero_ether_addr(e.ether))
return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 4eef55da0878..8da228da53ae 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -531,8 +531,8 @@ nla_put_failure:
ret = -EMSGSIZE;
} else {
cb->args[IPSET_CB_ARG0] = i;
+ ipset_nest_end(skb, atd);
}
- ipset_nest_end(skb, atd);
out:
rcu_read_unlock();
return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 83395bf6dc35..432141f04af3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3980,6 +3980,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
static struct notifier_block ip_vs_dst_notifier = {
.notifier_call = ip_vs_dst_event,
+#ifdef CONFIG_IP_VS_IPV6
+ .priority = ADDRCONF_NOTIFY_PRIORITY + 5,
+#endif
};
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 02ca7df793f5..9cd180bda092 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -49,6 +49,7 @@ struct nf_conncount_tuple {
struct nf_conntrack_zone zone;
int cpu;
u32 jiffies32;
+ bool dead;
struct rcu_head rcu_head;
};
@@ -106,15 +107,16 @@ nf_conncount_add(struct nf_conncount_list *list,
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
- spin_lock(&list->list_lock);
+ conn->dead = false;
+ spin_lock_bh(&list->list_lock);
if (list->dead == true) {
kmem_cache_free(conncount_conn_cachep, conn);
- spin_unlock(&list->list_lock);
+ spin_unlock_bh(&list->list_lock);
return NF_CONNCOUNT_SKIP;
}
list_add_tail(&conn->node, &list->head);
list->count++;
- spin_unlock(&list->list_lock);
+ spin_unlock_bh(&list->list_lock);
return NF_CONNCOUNT_ADDED;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);
@@ -132,19 +134,22 @@ static bool conn_free(struct nf_conncount_list *list,
{
bool free_entry = false;
- spin_lock(&list->list_lock);
+ spin_lock_bh(&list->list_lock);
- if (list->count == 0) {
- spin_unlock(&list->list_lock);
- return free_entry;
+ if (conn->dead) {
+ spin_unlock_bh(&list->list_lock);
+ return free_entry;
}
list->count--;
+ conn->dead = true;
list_del_rcu(&conn->node);
- if (list->count == 0)
+ if (list->count == 0) {
+ list->dead = true;
free_entry = true;
+ }
- spin_unlock(&list->list_lock);
+ spin_unlock_bh(&list->list_lock);
call_rcu(&conn->rcu_head, __conn_free);
return free_entry;
}
@@ -245,7 +250,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
{
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
- list->count = 1;
+ list->count = 0;
list->dead = false;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
@@ -259,6 +264,7 @@ bool nf_conncount_gc_list(struct net *net,
struct nf_conn *found_ct;
unsigned int collected = 0;
bool free_entry = false;
+ bool ret = false;
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
found = find_or_evict(net, list, conn, &free_entry);
@@ -288,7 +294,15 @@ bool nf_conncount_gc_list(struct net *net,
if (collected > CONNCOUNT_GC_MAX_NODES)
return false;
}
- return false;
+
+ spin_lock_bh(&list->list_lock);
+ if (!list->count) {
+ list->dead = true;
+ ret = true;
+ }
+ spin_unlock_bh(&list->list_lock);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
@@ -309,11 +323,8 @@ static void tree_nodes_free(struct rb_root *root,
while (gc_count) {
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
- if (rbconn->list.count == 0 && rbconn->list.dead == false) {
- rbconn->list.dead = true;
- rb_erase(&rbconn->node, root);
- call_rcu(&rbconn->rcu_head, __tree_nodes_free);
- }
+ rb_erase(&rbconn->node, root);
+ call_rcu(&rbconn->rcu_head, __tree_nodes_free);
spin_unlock(&rbconn->list.list_lock);
}
}
@@ -414,8 +425,9 @@ insert_tree(struct net *net,
nf_conncount_list_init(&rbconn->list);
list_add(&conn->node, &rbconn->list.head);
count = 1;
+ rbconn->list.count = count;
- rb_link_node(&rbconn->node, parent, rbnode);
+ rb_link_node_rcu(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, root);
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 1d66de5151b2..49e523cc49d0 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -25,102 +25,15 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-#ifdef CONFIG_SYSCTL
-static struct ctl_table acct_sysctl_table[] = {
- {
- .procname = "nf_conntrack_acct",
- .data = &init_net.ct.sysctl_acct,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {}
-};
-#endif /* CONFIG_SYSCTL */
-
-unsigned int
-seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
-{
- struct nf_conn_acct *acct;
- struct nf_conn_counter *counter;
-
- acct = nf_conn_acct_find(ct);
- if (!acct)
- return 0;
-
- counter = acct->counter;
- seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)atomic64_read(&counter[dir].packets),
- (unsigned long long)atomic64_read(&counter[dir].bytes));
-
- return 0;
-};
-EXPORT_SYMBOL_GPL(seq_print_acct);
-
static const struct nf_ct_ext_type acct_extend = {
.len = sizeof(struct nf_conn_acct),
.align = __alignof__(struct nf_conn_acct),
.id = NF_CT_EXT_ACCT,
};
-#ifdef CONFIG_SYSCTL
-static int nf_conntrack_acct_init_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
- GFP_KERNEL);
- if (!table)
- goto out;
-
- table[0].data = &net->ct.sysctl_acct;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
-
- net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
- table);
- if (!net->ct.acct_sysctl_header) {
- pr_err("can't register to sysctl\n");
- goto out_register;
- }
- return 0;
-
-out_register:
- kfree(table);
-out:
- return -ENOMEM;
-}
-
-static void nf_conntrack_acct_fini_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = net->ct.acct_sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.acct_sysctl_header);
- kfree(table);
-}
-#else
-static int nf_conntrack_acct_init_sysctl(struct net *net)
-{
- return 0;
-}
-
-static void nf_conntrack_acct_fini_sysctl(struct net *net)
-{
-}
-#endif
-
-int nf_conntrack_acct_pernet_init(struct net *net)
+void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
- return nf_conntrack_acct_init_sysctl(net);
-}
-
-void nf_conntrack_acct_pernet_fini(struct net *net)
-{
- nf_conntrack_acct_fini_sysctl(net);
}
int nf_conntrack_acct_init(void)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e92e749aff53..e87c21e47efe 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2110,10 +2110,7 @@ i_see_dead_people:
list_for_each_entry(net, net_exit_list, exit_list) {
nf_conntrack_proto_pernet_fini(net);
- nf_conntrack_helper_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
- nf_conntrack_tstamp_pernet_fini(net);
- nf_conntrack_acct_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
free_percpu(net->ct.pcpu_lists);
@@ -2410,32 +2407,19 @@ int nf_conntrack_init_net(struct net *net)
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_pernet_init(net);
- if (ret < 0)
- goto err_acct;
- ret = nf_conntrack_tstamp_pernet_init(net);
- if (ret < 0)
- goto err_tstamp;
- ret = nf_conntrack_ecache_pernet_init(net);
- if (ret < 0)
- goto err_ecache;
- ret = nf_conntrack_helper_pernet_init(net);
- if (ret < 0)
- goto err_helper;
+
+ nf_conntrack_acct_pernet_init(net);
+ nf_conntrack_tstamp_pernet_init(net);
+ nf_conntrack_ecache_pernet_init(net);
+ nf_conntrack_helper_pernet_init(net);
+
ret = nf_conntrack_proto_pernet_init(net);
if (ret < 0)
goto err_proto;
return 0;
err_proto:
- nf_conntrack_helper_pernet_fini(net);
-err_helper:
nf_conntrack_ecache_pernet_fini(net);
-err_ecache:
- nf_conntrack_tstamp_pernet_fini(net);
-err_tstamp:
- nf_conntrack_acct_pernet_fini(net);
-err_acct:
nf_conntrack_expect_pernet_fini(net);
err_expect:
free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index c11822a7d2bf..3d042f8ff183 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -336,85 +336,21 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
-#ifdef CONFIG_SYSCTL
-static struct ctl_table event_sysctl_table[] = {
- {
- .procname = "nf_conntrack_events",
- .data = &init_net.ct.sysctl_events,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {}
-};
-#endif /* CONFIG_SYSCTL */
-
static const struct nf_ct_ext_type event_extend = {
.len = sizeof(struct nf_conntrack_ecache),
.align = __alignof__(struct nf_conntrack_ecache),
.id = NF_CT_EXT_ECACHE,
};
-#ifdef CONFIG_SYSCTL
-static int nf_conntrack_event_init_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
- GFP_KERNEL);
- if (!table)
- goto out;
-
- table[0].data = &net->ct.sysctl_events;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
-
- net->ct.event_sysctl_header =
- register_net_sysctl(net, "net/netfilter", table);
- if (!net->ct.event_sysctl_header) {
- pr_err("can't register to sysctl\n");
- goto out_register;
- }
- return 0;
-
-out_register:
- kfree(table);
-out:
- return -ENOMEM;
-}
-
-static void nf_conntrack_event_fini_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = net->ct.event_sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.event_sysctl_header);
- kfree(table);
-}
-#else
-static int nf_conntrack_event_init_sysctl(struct net *net)
-{
- return 0;
-}
-
-static void nf_conntrack_event_fini_sysctl(struct net *net)
-{
-}
-#endif /* CONFIG_SYSCTL */
-
-int nf_conntrack_ecache_pernet_init(struct net *net)
+void nf_conntrack_ecache_pernet_init(struct net *net)
{
net->ct.sysctl_events = nf_ct_events;
INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
- return nf_conntrack_event_init_sysctl(net);
}
void nf_conntrack_ecache_pernet_fini(struct net *net)
{
cancel_delayed_work_sync(&net->ct.ecache_dwork);
- nf_conntrack_event_fini_sysctl(net);
}
int nf_conntrack_ecache_init(void)
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index e24b762ffa1d..274baf1dab87 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -42,67 +42,6 @@ module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
MODULE_PARM_DESC(nf_conntrack_helper,
"Enable automatic conntrack helper assignment (default 0)");
-#ifdef CONFIG_SYSCTL
-static struct ctl_table helper_sysctl_table[] = {
- {
- .procname = "nf_conntrack_helper",
- .data = &init_net.ct.sysctl_auto_assign_helper,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {}
-};
-
-static int nf_conntrack_helper_init_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table),
- GFP_KERNEL);
- if (!table)
- goto out;
-
- table[0].data = &net->ct.sysctl_auto_assign_helper;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
-
- net->ct.helper_sysctl_header =
- register_net_sysctl(net, "net/netfilter", table);
-
- if (!net->ct.helper_sysctl_header) {
- pr_err("nf_conntrack_helper: can't register to sysctl.\n");
- goto out_register;
- }
- return 0;
-
-out_register:
- kfree(table);
-out:
- return -ENOMEM;
-}
-
-static void nf_conntrack_helper_fini_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = net->ct.helper_sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.helper_sysctl_header);
- kfree(table);
-}
-#else
-static int nf_conntrack_helper_init_sysctl(struct net *net)
-{
- return 0;
-}
-
-static void nf_conntrack_helper_fini_sysctl(struct net *net)
-{
-}
-#endif /* CONFIG_SYSCTL */
-
/* Stupid hash, but collision free for the default registrations of the
* helpers currently in the kernel. */
static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
@@ -533,16 +472,10 @@ static const struct nf_ct_ext_type helper_extend = {
.id = NF_CT_EXT_HELPER,
};
-int nf_conntrack_helper_pernet_init(struct net *net)
+void nf_conntrack_helper_pernet_init(struct net *net)
{
net->ct.auto_assign_helper_warned = false;
net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
- return nf_conntrack_helper_init_sysctl(net);
-}
-
-void nf_conntrack_helper_pernet_fini(struct net *net)
-{
- nf_conntrack_helper_fini_sysctl(net);
}
int nf_conntrack_helper_init(void)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4ae8e528943a..1213beb5a714 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -47,7 +47,6 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_helper.h>
#endif
@@ -1688,6 +1687,22 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
return 0;
}
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+static void ctnetlink_change_mark(struct nf_conn *ct,
+ const struct nlattr * const cda[])
+{
+ u32 mark, newmark, mask = 0;
+
+ if (cda[CTA_MARK_MASK])
+ mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+
+ mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ newmark = (ct->mark & mask) ^ mark;
+ if (newmark != ct->mark)
+ ct->mark = newmark;
+}
+#endif
+
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
@@ -1883,7 +1898,7 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
- ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ ctnetlink_change_mark(ct, cda);
#endif
if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
@@ -2027,7 +2042,7 @@ ctnetlink_create_conntrack(struct net *net,
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
- ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ ctnetlink_change_mark(ct, cda);
#endif
/* setup master conntrack: this is a confirmed expectation */
@@ -2524,14 +2539,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK]) {
- u32 mask = 0, mark, newmark;
- if (cda[CTA_MARK_MASK])
- mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
-
- mark = ntohl(nla_get_be32(cda[CTA_MARK]));
- newmark = (ct->mark & mask) ^ mark;
- if (newmark != ct->mark)
- ct->mark = newmark;
+ ctnetlink_change_mark(ct, cda);
}
#endif
return 0;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 40643af7137e..859f5d07a915 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -175,8 +175,7 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
static
int nf_ct_l4proto_register_sysctl(struct net *net,
- struct nf_proto_net *pn,
- const struct nf_conntrack_l4proto *l4proto)
+ struct nf_proto_net *pn)
{
int err = 0;
@@ -198,9 +197,7 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
}
static
-void nf_ct_l4proto_unregister_sysctl(struct net *net,
- struct nf_proto_net *pn,
- const struct nf_conntrack_l4proto *l4proto)
+void nf_ct_l4proto_unregister_sysctl(struct nf_proto_net *pn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table_header != NULL)
@@ -252,7 +249,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net,
if (pn == NULL)
goto out;
- ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto);
+ ret = nf_ct_l4proto_register_sysctl(net, pn);
if (ret < 0)
goto out;
@@ -296,7 +293,7 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
return;
pn->users--;
- nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
+ nf_ct_l4proto_unregister_sysctl(pn);
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
@@ -946,16 +943,14 @@ int nf_conntrack_proto_pernet_init(struct net *net)
if (err < 0)
return err;
err = nf_ct_l4proto_register_sysctl(net,
- pn,
- &nf_conntrack_l4proto_generic);
+ pn);
if (err < 0)
return err;
err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
ARRAY_SIZE(builtin_l4proto));
if (err < 0) {
- nf_ct_l4proto_unregister_sysctl(net, pn,
- &nf_conntrack_l4proto_generic);
+ nf_ct_l4proto_unregister_sysctl(pn);
return err;
}
@@ -971,9 +966,7 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
ARRAY_SIZE(builtin_l4proto));
pn->users--;
- nf_ct_l4proto_unregister_sysctl(net,
- pn,
- &nf_conntrack_l4proto_generic);
+ nf_ct_l4proto_unregister_sysctl(pn);
}
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 9b48dc8b4b88..8899b51aad44 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -43,24 +43,12 @@
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
-enum grep_conntrack {
- GRE_CT_UNREPLIED,
- GRE_CT_REPLIED,
- GRE_CT_MAX
-};
-
static const unsigned int gre_timeouts[GRE_CT_MAX] = {
[GRE_CT_UNREPLIED] = 30*HZ,
[GRE_CT_REPLIED] = 180*HZ,
};
static unsigned int proto_gre_net_id __read_mostly;
-struct netns_proto_gre {
- struct nf_proto_net nf;
- rwlock_t keymap_lock;
- struct list_head keymap_list;
- unsigned int gre_timeouts[GRE_CT_MAX];
-};
static inline struct netns_proto_gre *gre_pernet(struct net *net)
{
@@ -332,9 +320,49 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table gre_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_gre_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_gre_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {}
+};
+#endif
+
+static int gre_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *nf,
+ struct netns_proto_gre *net_gre)
+{
+#ifdef CONFIG_SYSCTL
+ int i;
+
+ if (nf->ctl_table)
+ return 0;
+
+ nf->ctl_table = kmemdup(gre_sysctl_table,
+ sizeof(gre_sysctl_table),
+ GFP_KERNEL);
+ if (!nf->ctl_table)
+ return -ENOMEM;
+
+ for (i = 0; i < GRE_CT_MAX; i++)
+ nf->ctl_table[i].data = &net_gre->gre_timeouts[i];
+#endif
+ return 0;
+}
+
static int gre_init_net(struct net *net)
{
struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_proto_net *nf = &net_gre->nf;
int i;
rwlock_init(&net_gre->keymap_lock);
@@ -342,7 +370,7 @@ static int gre_init_net(struct net *net)
for (i = 0; i < GRE_CT_MAX; i++)
net_gre->gre_timeouts[i] = gre_timeouts[i];
- return 0;
+ return gre_kmemdup_sysctl_table(net, nf, net_gre);
}
/* protocol helper struct */
@@ -402,6 +430,8 @@ static int __init nf_ct_proto_gre_init(void)
{
int ret;
+ BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0);
+
ret = register_pernet_subsys(&proto_gre_net_ops);
if (ret < 0)
goto out_pernet;
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index c879d8d78cfd..b4f5d5e82031 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -29,7 +29,7 @@
static const unsigned int udp_timeouts[UDP_CT_MAX] = {
[UDP_CT_UNREPLIED] = 30*HZ,
- [UDP_CT_REPLIED] = 180*HZ,
+ [UDP_CT_REPLIED] = 120*HZ,
};
static unsigned int *udp_get_timeouts(struct net *net)
@@ -100,11 +100,21 @@ static int udp_packet(struct nf_conn *ct,
if (!timeouts)
timeouts = udp_get_timeouts(nf_ct_net(ct));
+ if (!nf_ct_is_confirmed(ct))
+ ct->proto.udp.stream_ts = 2 * HZ + jiffies;
+
/* If we've seen traffic both ways, this is some kind of UDP
- stream. Extend timeout. */
+ * stream. Set Assured.
+ */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_REPLIED]);
+ unsigned long extra = timeouts[UDP_CT_UNREPLIED];
+
+ /* Still active after two seconds? Extend timeout. */
+ if (time_after(jiffies, ct->proto.udp.stream_ts))
+ extra = timeouts[UDP_CT_REPLIED];
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, extra);
+
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index a975efd6b8c3..9da303461069 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -115,12 +115,12 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb,
/* TCP SACK sequence number adjustment */
static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
unsigned int protoff,
- struct tcphdr *tcph,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- unsigned int dir, optoff, optend;
+ struct tcphdr *tcph = (void *)skb->data + protoff;
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ unsigned int dir, optoff, optend;
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
@@ -128,6 +128,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
if (!skb_make_writable(skb, optend))
return 0;
+ tcph = (void *)skb->data + protoff;
dir = CTINFO2DIR(ctinfo);
while (optoff < optend) {
@@ -207,7 +208,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
ntohl(newack));
tcph->ack_seq = newack;
- res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+ res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo);
out:
spin_unlock_bh(&ct->lock);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 463d17d349c1..b6177fd73304 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -267,6 +267,24 @@ static const char* l4proto_name(u16 proto)
return "unknown";
}
+static unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+ struct nf_conn_acct *acct;
+ struct nf_conn_counter *counter;
+
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
+ return 0;
+
+ counter = acct->counter;
+ seq_printf(s, "packets=%llu bytes=%llu ",
+ (unsigned long long)atomic64_read(&counter[dir].packets),
+ (unsigned long long)atomic64_read(&counter[dir].bytes));
+
+ return 0;
+}
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -514,36 +532,53 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
static struct ctl_table_header *nf_ct_netfilter_header;
+enum nf_ct_sysctl_index {
+ NF_SYSCTL_CT_MAX,
+ NF_SYSCTL_CT_COUNT,
+ NF_SYSCTL_CT_BUCKETS,
+ NF_SYSCTL_CT_CHECKSUM,
+ NF_SYSCTL_CT_LOG_INVALID,
+ NF_SYSCTL_CT_EXPECT_MAX,
+ NF_SYSCTL_CT_ACCT,
+ NF_SYSCTL_CT_HELPER,
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ NF_SYSCTL_CT_EVENTS,
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ NF_SYSCTL_CT_TIMESTAMP,
+#endif
+};
+
static struct ctl_table nf_ct_sysctl_table[] = {
- {
+ [NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
.data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
- {
+ [NF_SYSCTL_CT_BUCKETS] = {
.procname = "nf_conntrack_buckets",
.data = &nf_conntrack_htable_size_user,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = nf_conntrack_hash_sysctl,
},
- {
+ [NF_SYSCTL_CT_CHECKSUM] = {
.procname = "nf_conntrack_checksum",
.data = &init_net.ct.sysctl_checksum,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NF_SYSCTL_CT_LOG_INVALID] = {
.procname = "nf_conntrack_log_invalid",
.data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(unsigned int),
@@ -552,13 +587,45 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.extra1 = &log_invalid_proto_min,
.extra2 = &log_invalid_proto_max,
},
- {
+ [NF_SYSCTL_CT_EXPECT_MAX] = {
.procname = "nf_conntrack_expect_max",
.data = &nf_ct_expect_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ [NF_SYSCTL_CT_ACCT] = {
+ .procname = "nf_conntrack_acct",
+ .data = &init_net.ct.sysctl_acct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ [NF_SYSCTL_CT_HELPER] = {
+ .procname = "nf_conntrack_helper",
+ .data = &init_net.ct.sysctl_auto_assign_helper,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_SYSCTL_CT_EVENTS] = {
+ .procname = "nf_conntrack_events",
+ .data = &init_net.ct.sysctl_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_SYSCTL_CT_TIMESTAMP] = {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
{ }
};
@@ -582,16 +649,28 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
if (!table)
goto out_kmemdup;
- table[1].data = &net->ct.count;
- table[3].data = &net->ct.sysctl_checksum;
- table[4].data = &net->ct.sysctl_log_invalid;
+ table[NF_SYSCTL_CT_COUNT].data = &net->ct.count;
+ table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
+#endif
/* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ if (net->user_ns != &init_user_ns) {
+ table[NF_SYSCTL_CT_MAX].procname = NULL;
+ table[NF_SYSCTL_CT_ACCT].procname = NULL;
+ table[NF_SYSCTL_CT_HELPER].procname = NULL;
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ table[NF_SYSCTL_CT_EVENTS].procname = NULL;
+#endif
+ }
if (!net_eq(&init_net, net))
- table[2].mode = 0444;
+ table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.sysctl_header)
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index 56766cb26e40..705b912bd91f 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -22,83 +22,15 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-#ifdef CONFIG_SYSCTL
-static struct ctl_table tstamp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_timestamp",
- .data = &init_net.ct.sysctl_tstamp,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {}
-};
-#endif /* CONFIG_SYSCTL */
-
static const struct nf_ct_ext_type tstamp_extend = {
.len = sizeof(struct nf_conn_tstamp),
.align = __alignof__(struct nf_conn_tstamp),
.id = NF_CT_EXT_TSTAMP,
};
-#ifdef CONFIG_SYSCTL
-static int nf_conntrack_tstamp_init_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
- GFP_KERNEL);
- if (!table)
- goto out;
-
- table[0].data = &net->ct.sysctl_tstamp;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
-
- net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter",
- table);
- if (!net->ct.tstamp_sysctl_header) {
- pr_err("can't register to sysctl\n");
- goto out_register;
- }
- return 0;
-
-out_register:
- kfree(table);
-out:
- return -ENOMEM;
-}
-
-static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
-{
- struct ctl_table *table;
-
- table = net->ct.tstamp_sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
- kfree(table);
-}
-#else
-static int nf_conntrack_tstamp_init_sysctl(struct net *net)
-{
- return 0;
-}
-
-static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
-{
-}
-#endif
-
-int nf_conntrack_tstamp_pernet_init(struct net *net)
+void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
- return nf_conntrack_tstamp_init_sysctl(net);
-}
-
-void nf_conntrack_tstamp_pernet_fini(struct net *net)
-{
- nf_conntrack_tstamp_fini_sysctl(net);
}
int nf_conntrack_tstamp_init(void)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index b7a4816add76..fa0844e2a68d 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -247,9 +247,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
-int nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
- void *data)
+static int
+nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct flow_offload *flow, void *data),
+ void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
struct rhashtable_iter hti;
@@ -279,40 +280,19 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
-EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
-static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
+static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
- struct flow_offload_tuple_rhash *tuplehash;
- struct rhashtable_iter hti;
- struct flow_offload *flow;
-
- rhashtable_walk_enter(&flow_table->rhashtable, &hti);
- rhashtable_walk_start(&hti);
+ struct nf_flowtable *flow_table = data;
- while ((tuplehash = rhashtable_walk_next(&hti))) {
- if (IS_ERR(tuplehash)) {
- if (PTR_ERR(tuplehash) != -EAGAIN)
- break;
- continue;
- }
- if (tuplehash->tuple.dir)
- continue;
-
- flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
-
- if (nf_flow_has_expired(flow) ||
- (flow->flags & (FLOW_OFFLOAD_DYING |
- FLOW_OFFLOAD_TEARDOWN)))
- flow_offload_del(flow_table, flow);
- }
- rhashtable_walk_stop(&hti);
- rhashtable_walk_exit(&hti);
+ if (nf_flow_has_expired(flow) ||
+ (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
+ flow_offload_del(flow_table, flow);
}
static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -320,7 +300,7 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
struct nf_flowtable *flow_table;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_offload_gc_step(flow_table);
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
@@ -504,7 +484,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_unlock(&flowtable_lock);
cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- nf_flow_offload_gc_step(flow_table);
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a8c5c846aec1..3a0d6880b7c9 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -156,22 +156,20 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
const struct net_device *out,
const struct nf_loginfo *loginfo, const char *prefix)
{
+ const struct net_device *physoutdev __maybe_unused;
+ const struct net_device *physindev __maybe_unused;
+
nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
'0' + loginfo->u.log.level, prefix,
in ? in->name : "",
out ? out->name : "");
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge) {
- const struct net_device *physindev;
- const struct net_device *physoutdev;
-
- physindev = nf_bridge_get_physindev(skb);
- if (physindev && in != physindev)
- nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = nf_bridge_get_physoutdev(skb);
- if (physoutdev && out != physoutdev)
- nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
- }
+ physindev = nf_bridge_get_physindev(skb);
+ if (physindev && in != physindev)
+ nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+ physoutdev = nf_bridge_get_physoutdev(skb);
+ if (physoutdev && out != physoutdev)
+ nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
#endif
}
EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index e2b196054dfc..d159e9e7835b 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -23,7 +23,6 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -38,8 +37,6 @@ static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
-static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
- __read_mostly;
static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
@@ -67,13 +64,6 @@ __nf_nat_l3proto_find(u8 family)
return rcu_dereference(nf_nat_l3protos[family]);
}
-inline const struct nf_nat_l4proto *
-__nf_nat_l4proto_find(u8 family, u8 protonum)
-{
- return rcu_dereference(nf_nat_l4protos[family][protonum]);
-}
-EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);
-
#ifdef CONFIG_XFRM
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
@@ -117,7 +107,8 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
dst = skb_dst(skb);
if (dst->xfrm)
dst = ((struct xfrm_dst *)dst)->route;
- dst_hold(dst);
+ if (!dst_hold_safe(dst))
+ return -EHOSTUNREACH;
if (sk && !net_eq(net, sock_net(sk)))
sk = NULL;
@@ -172,27 +163,66 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
}
EXPORT_SYMBOL(nf_nat_used_tuple);
+static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range2 *range)
+{
+ if (t->src.l3num == NFPROTO_IPV4)
+ return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
+ ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
+
+ return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
+ ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
+}
+
+/* Is the manipable part of the tuple between min and max incl? */
+static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ __be16 port;
+
+ switch (tuple->dst.protonum) {
+ case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMPV6:
+ return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
+ ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
+ case IPPROTO_GRE: /* all fall though */
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_DCCP:
+ case IPPROTO_SCTP:
+ if (maniptype == NF_NAT_MANIP_SRC)
+ port = tuple->src.u.all;
+ else
+ port = tuple->dst.u.all;
+
+ return ntohs(port) >= ntohs(min->all) &&
+ ntohs(port) <= ntohs(max->all);
+ default:
+ return true;
+ }
+}
+
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range.
*/
-static int in_range(const struct nf_nat_l3proto *l3proto,
- const struct nf_nat_l4proto *l4proto,
- const struct nf_conntrack_tuple *tuple,
+static int in_range(const struct nf_conntrack_tuple *tuple,
const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
* range specified, otherwise let this drag us onto a new src IP.
*/
if (range->flags & NF_NAT_RANGE_MAP_IPS &&
- !l3proto->in_range(tuple, range))
+ !nf_nat_inet_in_range(tuple, range))
return 0;
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
- l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
- &range->min_proto, &range->max_proto))
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
return 1;
- return 0;
+ return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
+ &range->min_proto, &range->max_proto);
}
static inline int
@@ -211,8 +241,6 @@ same_src(const struct nf_conn *ct,
static int
find_appropriate_src(struct net *net,
const struct nf_conntrack_zone *zone,
- const struct nf_nat_l3proto *l3proto,
- const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
const struct nf_nat_range2 *range)
@@ -229,7 +257,7 @@ find_appropriate_src(struct net *net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
- if (in_range(l3proto, l4proto, result, range))
+ if (in_range(result, range))
return 1;
}
}
@@ -310,6 +338,123 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
}
}
+/* Alter the per-proto part of the tuple (depending on maniptype), to
+ * give a unique tuple in the given range if possible.
+ *
+ * Per-protocol part of tuple is initialized to the incoming packet.
+ */
+static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ unsigned int range_size, min, max, i, attempts;
+ __be16 *keyptr;
+ u16 off;
+ static const unsigned int max_attempts = 128;
+
+ switch (tuple->dst.protonum) {
+ case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMPV6:
+ /* id is same for either direction... */
+ keyptr = &tuple->src.u.icmp.id;
+ min = range->min_proto.icmp.id;
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+ goto find_free_id;
+#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
+ case IPPROTO_GRE:
+ /* If there is no master conntrack we are not PPTP,
+ do not change tuples */
+ if (!ct->master)
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ keyptr = &tuple->src.u.gre.key;
+ else
+ keyptr = &tuple->dst.u.gre.key;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ min = 1;
+ range_size = 65535;
+ } else {
+ min = ntohs(range->min_proto.gre.key);
+ range_size = ntohs(range->max_proto.gre.key) - min + 1;
+ }
+ goto find_free_id;
+#endif
+ case IPPROTO_UDP: /* fallthrough */
+ case IPPROTO_UDPLITE: /* fallthrough */
+ case IPPROTO_TCP: /* fallthrough */
+ case IPPROTO_SCTP: /* fallthrough */
+ case IPPROTO_DCCP: /* fallthrough */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ keyptr = &tuple->src.u.all;
+ else
+ keyptr = &tuple->dst.u.all;
+
+ break;
+ default:
+ return;
+ }
+
+ /* If no range specified... */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == NF_NAT_MANIP_DST)
+ return;
+
+ if (ntohs(*keyptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*keyptr) < 512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min_proto.all);
+ max = ntohs(range->max_proto.all);
+ if (unlikely(max < min))
+ swap(max, min);
+ range_size = max - min + 1;
+ }
+
+find_free_id:
+ if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
+ off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
+ else
+ off = prandom_u32();
+
+ attempts = range_size;
+ if (attempts > max_attempts)
+ attempts = max_attempts;
+
+ /* We are in softirq; doing a search of the entire range risks
+ * soft lockup when all tuples are already used.
+ *
+ * If we can't find any free port from first offset, pick a new
+ * one and try again, with ever smaller search window.
+ */
+another_round:
+ for (i = 0; i < attempts; i++, off++) {
+ *keyptr = htons(min + off % range_size);
+ if (!nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+
+ if (attempts >= range_size || attempts < 16)
+ return;
+ attempts /= 2;
+ off = prandom_u32();
+ goto another_round;
+}
+
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
* we change the source to map into the range. For NF_INET_PRE_ROUTING
* and NF_INET_LOCAL_OUT, we change the destination to map into the
@@ -324,17 +469,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
- const struct nf_nat_l3proto *l3proto;
- const struct nf_nat_l4proto *l4proto;
struct net *net = nf_ct_net(ct);
zone = nf_ct_zone(ct);
- rcu_read_lock();
- l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
- l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
- orig_tuple->dst.protonum);
-
/* 1) If this srcip/proto/src-proto-part is currently mapped,
* and that same mapping gives a unique tuple within the given
* range, use that.
@@ -346,16 +484,16 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
- if (in_range(l3proto, l4proto, orig_tuple, range)) {
+ if (in_range(orig_tuple, range)) {
if (!nf_nat_used_tuple(orig_tuple, ct)) {
*tuple = *orig_tuple;
- goto out;
+ return;
}
- } else if (find_appropriate_src(net, zone, l3proto, l4proto,
+ } else if (find_appropriate_src(net, zone,
orig_tuple, tuple, range)) {
pr_debug("get_unique_tuple: Found current src map\n");
if (!nf_nat_used_tuple(tuple, ct))
- goto out;
+ return;
}
}
@@ -371,21 +509,19 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
- l4proto->in_range(tuple, maniptype,
+ l4proto_in_range(tuple, maniptype,
&range->min_proto,
&range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
- goto out;
+ return;
} else if (!nf_nat_used_tuple(tuple, ct)) {
- goto out;
+ return;
}
}
/* Last chance: get protocol to try to obtain unique tuple. */
- l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
-out:
- rcu_read_unlock();
+ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
@@ -501,16 +637,13 @@ static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_dir dir)
{
const struct nf_nat_l3proto *l3proto;
- const struct nf_nat_l4proto *l4proto;
struct nf_conntrack_tuple target;
/* We are aiming to look like inverse of other direction. */
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
l3proto = __nf_nat_l3proto_find(target.src.l3num);
- l4proto = __nf_nat_l4proto_find(target.src.l3num,
- target.dst.protonum);
- if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+ if (!l3proto->manip_pkt(skb, 0, &target, mtype))
return NF_DROP;
return NF_ACCEPT;
@@ -666,16 +799,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
-{
- struct nf_nat_proto_clean clean = {
- .l3proto = l3proto,
- .l4proto = l4proto,
- };
-
- nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
-}
-
static void nf_nat_l3proto_clean(u8 l3proto)
{
struct nf_nat_proto_clean clean = {
@@ -685,82 +808,8 @@ static void nf_nat_l3proto_clean(u8 l3proto)
nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
}
-/* Protocol registration. */
-int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
-{
- const struct nf_nat_l4proto **l4protos;
- unsigned int i;
- int ret = 0;
-
- mutex_lock(&nf_nat_proto_mutex);
- if (nf_nat_l4protos[l3proto] == NULL) {
- l4protos = kmalloc_array(IPPROTO_MAX,
- sizeof(struct nf_nat_l4proto *),
- GFP_KERNEL);
- if (l4protos == NULL) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0; i < IPPROTO_MAX; i++)
- RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
-
- /* Before making proto_array visible to lockless readers,
- * we must make sure its content is committed to memory.
- */
- smp_wmb();
-
- nf_nat_l4protos[l3proto] = l4protos;
- }
-
- if (rcu_dereference_protected(
- nf_nat_l4protos[l3proto][l4proto->l4proto],
- lockdep_is_held(&nf_nat_proto_mutex)
- ) != &nf_nat_l4proto_unknown) {
- ret = -EBUSY;
- goto out;
- }
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
- out:
- mutex_unlock(&nf_nat_proto_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
-
-/* No one stores the protocol anywhere; simply delete it. */
-void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
-{
- mutex_lock(&nf_nat_proto_mutex);
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
- &nf_nat_l4proto_unknown);
- mutex_unlock(&nf_nat_proto_mutex);
- synchronize_rcu();
-
- nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
-}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
-
int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
- mutex_lock(&nf_nat_proto_mutex);
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
- &nf_nat_l4proto_tcp);
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
- &nf_nat_l4proto_udp);
-#ifdef CONFIG_NF_NAT_PROTO_DCCP
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
- &nf_nat_l4proto_dccp);
-#endif
-#ifdef CONFIG_NF_NAT_PROTO_SCTP
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
- &nf_nat_l4proto_sctp);
-#endif
-#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
- RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
- &nf_nat_l4proto_udplite);
-#endif
- mutex_unlock(&nf_nat_proto_mutex);
-
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
return 0;
}
@@ -801,12 +850,26 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
[CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
};
+static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range2 *range)
+{
+ if (tb[CTA_PROTONAT_PORT_MIN]) {
+ range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
+ range->max_proto.all = range->min_proto.all;
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ if (tb[CTA_PROTONAT_PORT_MAX]) {
+ range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ return 0;
+}
+
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
const struct nf_conn *ct,
struct nf_nat_range2 *range)
{
struct nlattr *tb[CTA_PROTONAT_MAX+1];
- const struct nf_nat_l4proto *l4proto;
int err;
err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr,
@@ -814,11 +877,7 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr,
if (err < 0)
return err;
- l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- if (l4proto->nlattr_to_range)
- err = l4proto->nlattr_to_range(tb, range);
-
- return err;
+ return nf_nat_l4proto_nlattr_to_range(tb, range);
}
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
@@ -1081,7 +1140,6 @@ static int __init nf_nat_init(void)
static void __exit nf_nat_cleanup(void)
{
struct nf_nat_proto_clean clean = {};
- unsigned int i;
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
@@ -1089,10 +1147,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
- synchronize_rcu();
-
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
- kfree(nf_nat_l4protos[i]);
synchronize_net();
kvfree(nf_nat_bysource);
unregister_pernet_subsys(&nat_net_ops);
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
new file mode 100644
index 000000000000..f83bf9d8c9f5
--- /dev/null
+++ b/net/netfilter/nf_nat_proto.c
@@ -0,0 +1,343 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+
+#include <linux/dccp.h>
+#include <linux/sctp.h>
+#include <net/sctp/checksum.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static void
+__udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, struct udphdr *hdr,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype, bool do_csum)
+{
+ __be16 *portptr, newport;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+ if (do_csum) {
+ l3proto->csum_update(skb, iphdroff, &hdr->check,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
+ false);
+ if (!hdr->check)
+ hdr->check = CSUM_MANGLED_0;
+ }
+ *portptr = newport;
+}
+
+static bool udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ bool do_csum;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+ do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+
+ __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
+ return true;
+}
+
+static bool udplite_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ struct udphdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+ __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
+#endif
+ return true;
+}
+
+static bool
+sctp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ struct sctphdr *hdr;
+ int hdrsize = 8;
+
+ /* This could be an inner header returned in imcp packet; in such
+ * cases we cannot update the checksum field since it is outside
+ * of the 8 bytes of transport layer headers we are guaranteed.
+ */
+ if (skb->len >= hdroff + sizeof(*hdr))
+ hdrsize = sizeof(*hdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct sctphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ hdr->source = tuple->src.u.sctp.port;
+ } else {
+ /* Get rid of dst port */
+ hdr->dest = tuple->dst.u.sctp.port;
+ }
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
+#endif
+ return true;
+}
+
+static bool
+tcp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct tcphdr *hdr;
+ __be16 *portptr, newport, oldport;
+ int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+ /* this could be a inner header returned in icmp packet; in such
+ cases we cannot update the checksum field since it is outside of
+ the 8 bytes of transport layer headers we are guaranteed */
+ if (skb->len >= hdroff + sizeof(struct tcphdr))
+ hdrsize = sizeof(struct tcphdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct tcphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.tcp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.tcp.port;
+ portptr = &hdr->dest;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
+ return true;
+}
+
+static bool
+dccp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ struct dccp_hdr *hdr;
+ __be16 *portptr, oldport, newport;
+ int hdrsize = 8; /* DCCP connection tracking guarantees this much */
+
+ if (skb->len >= hdroff + sizeof(struct dccp_hdr))
+ hdrsize = sizeof(struct dccp_hdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct dccp_hdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ newport = tuple->src.u.dccp.port;
+ portptr = &hdr->dccph_sport;
+ } else {
+ newport = tuple->dst.u.dccp.port;
+ portptr = &hdr->dccph_dport;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
+ false);
+#endif
+ return true;
+}
+
+static bool
+icmp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmphdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmphdr *)(skb->data + hdroff);
+ inet_proto_csum_replace2(&hdr->checksum, skb,
+ hdr->un.echo.id, tuple->src.u.icmp.id, false);
+ hdr->un.echo.id = tuple->src.u.icmp.id;
+ return true;
+}
+
+static bool
+icmpv6_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmp6hdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmp6hdr *)(skb->data + hdroff);
+ l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
+ tuple, maniptype);
+ if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST ||
+ hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
+ inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
+ hdr->icmp6_identifier,
+ tuple->src.u.icmp.id, false);
+ hdr->icmp6_identifier = tuple->src.u.icmp.id;
+ }
+ return true;
+}
+
+/* manipulate a GRE packet according to maniptype */
+static bool
+gre_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
+ const struct gre_base_hdr *greh;
+ struct pptp_gre_header *pgreh;
+
+ /* pgreh includes two optional 32bit fields which are not required
+ * to be there. That's where the magic '8' comes from */
+ if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ return false;
+
+ greh = (void *)skb->data + hdroff;
+ pgreh = (struct pptp_gre_header *)greh;
+
+ /* we only have destination manip of a packet, since 'source key'
+ * is not present in the packet itself */
+ if (maniptype != NF_NAT_MANIP_DST)
+ return true;
+
+ switch (greh->flags & GRE_VERSION) {
+ case GRE_VERSION_0:
+ /* We do not currently NAT any GREv0 packets.
+ * Try to behave like "nf_nat_proto_unknown" */
+ break;
+ case GRE_VERSION_1:
+ pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
+ pgreh->call_id = tuple->dst.u.gre.key;
+ break;
+ default:
+ pr_debug("can't nat unknown GRE version\n");
+ return false;
+ }
+#endif
+ return true;
+}
+
+bool nf_nat_l4proto_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ switch (tuple->dst.protonum) {
+ case IPPROTO_TCP:
+ return tcp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_UDP:
+ return udp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_UDPLITE:
+ return udplite_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_SCTP:
+ return sctp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_ICMP:
+ return icmp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_ICMPV6:
+ return icmpv6_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_DCCP:
+ return dccp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ case IPPROTO_GRE:
+ return gre_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ tuple, maniptype);
+ }
+
+ /* If we don't know protocol -- no error, pass it unmodified. */
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_manip_pkt);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 5d849d835561..000000000000
--- a/net/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/random.h>
-#include <linux/netfilter.h>
-#include <linux/export.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- __be16 port;
-
- if (maniptype == NF_NAT_MANIP_SRC)
- port = tuple->src.u.all;
- else
- port = tuple->dst.u.all;
-
- return ntohs(port) >= ntohs(min->all) &&
- ntohs(port) <= ntohs(max->all);
-}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
-
-void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct,
- u16 *rover)
-{
- unsigned int range_size, min, max, i;
- __be16 *portptr;
- u_int16_t off;
-
- if (maniptype == NF_NAT_MANIP_SRC)
- portptr = &tuple->src.u.all;
- else
- portptr = &tuple->dst.u.all;
-
- /* If no range specified... */
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
- /* If it's dst rewrite, can't change port */
- if (maniptype == NF_NAT_MANIP_DST)
- return;
-
- if (ntohs(*portptr) < 1024) {
- /* Loose convention: >> 512 is credential passing */
- if (ntohs(*portptr) < 512) {
- min = 1;
- range_size = 511 - min + 1;
- } else {
- min = 600;
- range_size = 1023 - min + 1;
- }
- } else {
- min = 1024;
- range_size = 65535 - 1024 + 1;
- }
- } else {
- min = ntohs(range->min_proto.all);
- max = ntohs(range->max_proto.all);
- if (unlikely(max < min))
- swap(max, min);
- range_size = max - min + 1;
- }
-
- if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
- off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
- ? tuple->dst.u.all
- : tuple->src.u.all);
- } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
- off = prandom_u32();
- } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
- off = (ntohs(*portptr) - ntohs(range->base_proto.all));
- } else {
- off = *rover;
- }
-
- for (i = 0; ; ++off) {
- *portptr = htons(min + off % range_size);
- if (++i != range_size && nf_nat_used_tuple(tuple, ct))
- continue;
- if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
- NF_NAT_RANGE_PROTO_OFFSET)))
- *rover = off;
- return;
- }
-}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range2 *range)
-{
- if (tb[CTA_PROTONAT_PORT_MIN]) {
- range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
- range->max_proto.all = range->min_proto.all;
- range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
- if (tb[CTA_PROTONAT_PORT_MAX]) {
- range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
- range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range);
-#endif
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 67ea0d83aa5a..000000000000
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * DCCP NAT protocol helper
- *
- * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/dccp.h>
-
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u_int16_t dccp_port_rover;
-
-static void
-dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &dccp_port_rover);
-}
-
-static bool
-dccp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct dccp_hdr *hdr;
- __be16 *portptr, oldport, newport;
- int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
- if (skb->len >= hdroff + sizeof(struct dccp_hdr))
- hdrsize = sizeof(struct dccp_hdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- newport = tuple->src.u.dccp.port;
- portptr = &hdr->dccph_sport;
- } else {
- newport = tuple->dst.u.dccp.port;
- portptr = &hdr->dccph_dport;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
- tuple, maniptype);
- inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- false);
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
- .l4proto = IPPROTO_DCCP,
- .manip_pkt = dccp_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = dccp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 1c5d9b65fbba..000000000000
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/sctp.h>
-#include <net/sctp/checksum.h>
-
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u_int16_t nf_sctp_port_rover;
-
-static void
-sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &nf_sctp_port_rover);
-}
-
-static bool
-sctp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct sctphdr *hdr;
- int hdrsize = 8;
-
- /* This could be an inner header returned in imcp packet; in such
- * cases we cannot update the checksum field since it is outside
- * of the 8 bytes of transport layer headers we are guaranteed.
- */
- if (skb->len >= hdroff + sizeof(*hdr))
- hdrsize = sizeof(*hdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct sctphdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- /* Get rid of src port */
- hdr->source = tuple->src.u.sctp.port;
- } else {
- /* Get rid of dst port */
- hdr->dest = tuple->dst.u.sctp.port;
- }
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- hdr->checksum = sctp_compute_cksum(skb, hdroff);
- skb->ip_summed = CHECKSUM_NONE;
- }
-
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
- .l4proto = IPPROTO_SCTP,
- .manip_pkt = sctp_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = sctp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index f15fcd475f98..000000000000
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/tcp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-#include <net/netfilter/nf_nat_core.h>
-
-static u16 tcp_port_rover;
-
-static void
-tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &tcp_port_rover);
-}
-
-static bool
-tcp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct tcphdr *hdr;
- __be16 *portptr, newport, oldport;
- int hdrsize = 8; /* TCP connection tracking guarantees this much */
-
- /* this could be a inner header returned in icmp packet; in such
- cases we cannot update the checksum field since it is outside of
- the 8 bytes of transport layer headers we are guaranteed */
- if (skb->len >= hdroff + sizeof(struct tcphdr))
- hdrsize = sizeof(struct tcphdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct tcphdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- /* Get rid of src port */
- newport = tuple->src.u.tcp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst port */
- newport = tuple->dst.u.tcp.port;
- portptr = &hdr->dest;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_tcp = {
- .l4proto = IPPROTO_TCP,
- .manip_pkt = tcp_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = tcp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index 5790f70a83b2..000000000000
--- a/net/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u16 udp_port_rover;
-
-static void
-udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &udp_port_rover);
-}
-
-static void
-__udp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, struct udphdr *hdr,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype, bool do_csum)
-{
- __be16 *portptr, newport;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- /* Get rid of src port */
- newport = tuple->src.u.udp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst port */
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
- if (do_csum) {
- l3proto->csum_update(skb, iphdroff, &hdr->check,
- tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
- false);
- if (!hdr->check)
- hdr->check = CSUM_MANGLED_0;
- }
- *portptr = newport;
-}
-
-static bool udp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct udphdr *hdr;
- bool do_csum;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct udphdr *)(skb->data + hdroff);
- do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
-
- __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
- return true;
-}
-
-#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
-static u16 udplite_port_rover;
-
-static bool udplite_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct udphdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct udphdr *)(skb->data + hdroff);
- __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
- return true;
-}
-
-static void
-udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &udplite_port_rover);
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
- .l4proto = IPPROTO_UDPLITE,
- .manip_pkt = udplite_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = udplite_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
-
-const struct nf_nat_l4proto nf_nat_l4proto_udp = {
- .l4proto = IPPROTO_UDP,
- .manip_pkt = udp_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = udp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index c5db3e251232..000000000000
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* The "unknown" protocol. This is what is used for protocols we
- * don't understand. It's returned by ip_ct_find_proto().
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type manip_type,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return true;
-}
-
-static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- /* Sorry: we can't help you; if it's not unique, we can't frob
- * anything.
- */
- return;
-}
-
-static bool
-unknown_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_unknown = {
- .manip_pkt = unknown_manip_pkt,
- .in_range = unknown_in_range,
- .unique_tuple = unknown_unique_tuple,
-};
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 1f3086074981..aa1be643d7a0 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -18,6 +18,7 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -316,6 +317,9 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
static void nf_nat_sip_expected(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ struct nf_conn_help *help = nfct_help(ct->master);
+ struct nf_conntrack_expect *pair_exp;
+ int range_set_for_snat = 0;
struct nf_nat_range2 range;
/* This must be a fresh one. */
@@ -327,15 +331,42 @@ static void nf_nat_sip_expected(struct nf_conn *ct,
range.min_addr = range.max_addr = exp->saved_addr;
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
- /* Change src to where master sends to, but only if the connection
- * actually came from the same source. */
- if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+ /* Do media streams SRC manip according with the parameters
+ * found in the paired expectation.
+ */
+ if (exp->class != SIP_EXPECT_SIGNALLING) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry(pair_exp, &help->expectations, lnode) {
+ if (pair_exp->tuple.src.l3num == nf_ct_l3num(ct) &&
+ pair_exp->tuple.dst.protonum == ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum &&
+ nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &pair_exp->saved_addr) &&
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all == pair_exp->saved_proto.all) {
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto.all = range.max_proto.all = pair_exp->tuple.dst.u.all;
+ range.min_addr = range.max_addr = pair_exp->tuple.dst.u3;
+ range_set_for_snat = 1;
+ break;
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ }
+
+ /* When no paired expectation has been found, change src to
+ * where master sends to, but only if the connection actually came
+ * from the same source.
+ */
+ if (!range_set_for_snat &&
+ nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
&ct->master->tuplehash[exp->dir].tuple.src.u3)) {
range.flags = NF_NAT_RANGE_MAP_IPS;
range.min_addr = range.max_addr
= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
- nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ range_set_for_snat = 1;
}
+
+ /* Perform SRC manip. */
+ if (range_set_for_snat)
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d67a96a25a68..a36a77bae1d6 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -46,6 +46,24 @@ void nf_unregister_queue_handler(struct net *net)
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
+static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge) {
+ struct net_device *physdev;
+
+ physdev = nf_bridge_get_physindev(skb);
+ if (physdev)
+ dev_put(physdev);
+ physdev = nf_bridge_get_physoutdev(skb);
+ if (physdev)
+ dev_put(physdev);
+ }
+#endif
+}
+
void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
@@ -57,20 +75,28 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
dev_put(state->out);
if (state->sk)
sock_put(state->sk);
+
+ nf_queue_entry_release_br_nf_refs(entry->skb);
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+
+static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb)
+{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (entry->skb->nf_bridge) {
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge) {
struct net_device *physdev;
- physdev = nf_bridge_get_physindev(entry->skb);
+ physdev = nf_bridge_get_physindev(skb);
if (physdev)
- dev_put(physdev);
- physdev = nf_bridge_get_physoutdev(entry->skb);
+ dev_hold(physdev);
+ physdev = nf_bridge_get_physoutdev(skb);
if (physdev)
- dev_put(physdev);
+ dev_hold(physdev);
}
#endif
}
-EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
/* Bump dev refs so they don't vanish while packet is out */
void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
@@ -83,18 +109,8 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
dev_hold(state->out);
if (state->sk)
sock_hold(state->sk);
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (entry->skb->nf_bridge) {
- struct net_device *physdev;
- physdev = nf_bridge_get_physindev(entry->skb);
- if (physdev)
- dev_hold(physdev);
- physdev = nf_bridge_get_physoutdev(entry->skb);
- if (physdev)
- dev_hold(physdev);
- }
-#endif
+ nf_queue_entry_get_br_nf_refs(entry->skb);
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 42487d01a3ed..fec814dace5a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1216,7 +1216,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
goto nla_put_failure;
- if (basechain->stats && nft_dump_stats(skb, basechain->stats))
+ if (rcu_access_pointer(basechain->stats) &&
+ nft_dump_stats(skb, rcu_dereference(basechain->stats)))
goto nla_put_failure;
}
@@ -1392,7 +1393,8 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct nft_base_chain *chain,
+static void nft_chain_stats_replace(struct net *net,
+ struct nft_base_chain *chain,
struct nft_stats __percpu *newstats)
{
struct nft_stats __percpu *oldstats;
@@ -1400,8 +1402,9 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
if (newstats == NULL)
return;
- if (chain->stats) {
- oldstats = nfnl_dereference(chain->stats, NFNL_SUBSYS_NFTABLES);
+ if (rcu_access_pointer(chain->stats)) {
+ oldstats = rcu_dereference_protected(chain->stats,
+ lockdep_commit_lock_is_held(net));
rcu_assign_pointer(chain->stats, newstats);
synchronize_rcu();
free_percpu(oldstats);
@@ -1439,9 +1442,10 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
struct nft_base_chain *basechain = nft_base_chain(chain);
module_put(basechain->type->owner);
- free_percpu(basechain->stats);
- if (basechain->stats)
+ if (rcu_access_pointer(basechain->stats)) {
static_branch_dec(&nft_counters_enabled);
+ free_percpu(rcu_dereference_raw(basechain->stats));
+ }
kfree(chain->name);
kfree(basechain);
} else {
@@ -1590,7 +1594,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
kfree(basechain);
return PTR_ERR(stats);
}
- basechain->stats = stats;
+ rcu_assign_pointer(basechain->stats, stats);
static_branch_inc(&nft_counters_enabled);
}
@@ -2291,15 +2295,52 @@ struct nft_rule_dump_ctx {
char *chain;
};
+static int __nf_tables_dump_rules(struct sk_buff *skb,
+ unsigned int *idx,
+ struct netlink_callback *cb,
+ const struct nft_table *table,
+ const struct nft_chain *chain)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int s_idx = cb->args[0];
+ const struct nft_rule *rule;
+ int rc = 1;
+
+ list_for_each_entry_rcu(rule, &chain->rules, list) {
+ if (!nft_is_active(net, rule))
+ goto cont;
+ if (*idx < s_idx)
+ goto cont;
+ if (*idx > s_idx) {
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ }
+ if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWRULE,
+ NLM_F_MULTI | NLM_F_APPEND,
+ table->family,
+ table, chain, rule) < 0)
+ goto out_unfinished;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ (*idx)++;
+ }
+ rc = 0;
+out_unfinished:
+ cb->args[0] = *idx;
+ return rc;
+}
+
static int nf_tables_dump_rules(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
const struct nft_rule_dump_ctx *ctx = cb->data;
- const struct nft_table *table;
+ struct nft_table *table;
const struct nft_chain *chain;
- const struct nft_rule *rule;
- unsigned int idx = 0, s_idx = cb->args[0];
+ unsigned int idx = 0;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
@@ -2313,37 +2354,34 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
continue;
- list_for_each_entry_rcu(chain, &table->chains, list) {
- if (ctx && ctx->chain &&
- strcmp(ctx->chain, chain->name) != 0)
- continue;
+ if (ctx && ctx->chain) {
+ struct rhlist_head *list, *tmp;
- list_for_each_entry_rcu(rule, &chain->rules, list) {
- if (!nft_is_active(net, rule))
- goto cont;
- if (idx < s_idx)
- goto cont;
- if (idx > s_idx)
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFT_MSG_NEWRULE,
- NLM_F_MULTI | NLM_F_APPEND,
- table->family,
- table, chain, rule) < 0)
- goto done;
-
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
+ list = rhltable_lookup(&table->chains_ht, ctx->chain,
+ nft_chain_ht_params);
+ if (!list)
+ goto done;
+
+ rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
+ if (!nft_is_active(net, chain))
+ continue;
+ __nf_tables_dump_rules(skb, &idx,
+ cb, table, chain);
+ break;
}
+ goto done;
}
+
+ list_for_each_entry_rcu(chain, &table->chains, list) {
+ if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
+ goto done;
+ }
+
+ if (ctx && ctx->table)
+ break;
}
done:
rcu_read_unlock();
-
- cb->args[0] = idx;
return skb->len;
}
@@ -2457,7 +2495,7 @@ err:
static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
struct nft_rule *rule)
{
- struct nft_expr *expr;
+ struct nft_expr *expr, *next;
/*
* Careful: some expressions might not be initialized in case this
@@ -2465,8 +2503,9 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
*/
expr = nft_expr_first(rule);
while (expr != nft_expr_last(rule) && expr->ops) {
+ next = nft_expr_next(expr);
nf_tables_expr_destroy(ctx, expr);
- expr = nft_expr_next(expr);
+ expr = next;
}
kfree(rule);
}
@@ -2589,17 +2628,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
if (chain->use == UINT_MAX)
return -EOVERFLOW;
- }
-
- if (nla[NFTA_RULE_POSITION]) {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
- return -EOPNOTSUPP;
- pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nft_rule_lookup(chain, pos_handle);
- if (IS_ERR(old_rule)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
- return PTR_ERR(old_rule);
+ if (nla[NFTA_RULE_POSITION]) {
+ pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
+ old_rule = __nft_rule_lookup(chain, pos_handle);
+ if (IS_ERR(old_rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
+ return PTR_ERR(old_rule);
+ }
}
}
@@ -2669,21 +2705,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
- if (!nft_is_active_next(net, old_rule)) {
- err = -ENOENT;
- goto err2;
- }
- trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
- old_rule);
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
goto err2;
}
- nft_deactivate_next(net, old_rule);
- chain->use--;
-
- if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
- err = -ENOMEM;
+ err = nft_delrule(&ctx, old_rule);
+ if (err < 0) {
+ nft_trans_destroy(trans);
goto err2;
}
@@ -6189,7 +6218,8 @@ static void nft_chain_commit_update(struct nft_trans *trans)
return;
basechain = nft_base_chain(trans->ctx.chain);
- nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+ nft_chain_stats_replace(trans->ctx.net, basechain,
+ nft_trans_chain_stats(trans));
switch (nft_trans_chain_policy(trans)) {
case NF_DROP:
@@ -6324,7 +6354,7 @@ static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
}
-static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
struct nft_rule **g0, **g1;
bool next_genbit;
@@ -6441,11 +6471,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
/* step 2. Make rules_gen_X visible to packet path */
list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(chain, &table->chains, list) {
- if (!nft_is_active_next(net, chain))
- continue;
- nf_tables_commit_chain_active(net, chain);
- }
+ list_for_each_entry(chain, &table->chains, list)
+ nf_tables_commit_chain(net, chain);
}
/*
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 3fbce3b9c5ec..a50500232b0a 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -101,7 +101,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
struct nft_stats *stats;
base_chain = nft_base_chain(chain);
- if (!base_chain->stats)
+ if (!rcu_access_pointer(base_chain->stats))
return;
local_bh_disable();
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index a518eb162344..109b0d27345a 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -455,7 +455,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
case IPPROTO_TCP:
timeouts = nf_tcp_pernet(net)->timeouts;
break;
- case IPPROTO_UDP:
+ case IPPROTO_UDP: /* fallthrough */
+ case IPPROTO_UDPLITE:
timeouts = nf_udp_pernet(net)->timeouts;
break;
case IPPROTO_DCCP:
@@ -471,11 +472,21 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
timeouts = nf_sctp_pernet(net)->timeouts;
#endif
break;
+ case IPPROTO_GRE:
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ if (l4proto->net_id) {
+ struct netns_proto_gre *net_gre;
+
+ net_gre = net_generic(net, *l4proto->net_id);
+ timeouts = net_gre->gre_timeouts;
+ }
+#endif
+ break;
case 255:
timeouts = &nf_generic_pernet(net)->timeout;
break;
default:
- WARN_ON_ONCE(1);
+ WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto);
break;
}
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 332c69d27b47..b1f9c5303f02 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -148,7 +148,7 @@ static void
instance_put(struct nfulnl_instance *inst)
{
if (inst && refcount_dec_and_test(&inst->use))
- call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
+ call_rcu(&inst->rcu, nfulnl_instance_free_rcu);
}
static void nfulnl_timer(struct timer_list *t);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 43041f087eb3..0dcc3592d053 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -727,13 +727,13 @@ nf_queue_entry_dup(struct nf_queue_entry *e)
*/
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
- if (skb->nf_bridge)
+ if (nf_bridge_info_get(skb))
__skb_push(skb, skb->network_header - skb->mac_header);
}
static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
- if (skb->nf_bridge)
+ if (nf_bridge_info_get(skb))
__skb_pull(skb, skb->network_header - skb->mac_header);
}
#else
@@ -904,23 +904,22 @@ nfqnl_set_mode(struct nfqnl_instance *queue,
static int
dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
{
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ int physinif, physoutif;
+
+ physinif = nf_bridge_get_physinif(entry->skb);
+ physoutif = nf_bridge_get_physoutif(entry->skb);
+
+ if (physinif == ifindex || physoutif == ifindex)
+ return 1;
+#endif
if (entry->state.in)
if (entry->state.in->ifindex == ifindex)
return 1;
if (entry->state.out)
if (entry->state.out->ifindex == ifindex)
return 1;
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (entry->skb->nf_bridge) {
- int physinif, physoutif;
- physinif = nf_bridge_get_physinif(entry->skb);
- physoutif = nf_bridge_get_physoutif(entry->skb);
-
- if (physinif == ifindex || physoutif == ifindex)
- return 1;
- }
-#endif
return 0;
}
@@ -1148,8 +1147,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO])
return -EINVAL;
- entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]));
- entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]);
+ __vlan_hwaccel_put_tag(entry->skb,
+ nla_get_be16(tb[NFQA_VLAN_PROTO]),
+ ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])));
}
if (nfqa[NFQA_L2HDR]) {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 9d0ede474224..7334e0b80a5e 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -520,6 +520,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
void *info)
{
struct xt_match *match = expr->ops->data;
+ struct module *me = match->me;
struct xt_mtdtor_param par;
par.net = ctx->net;
@@ -530,7 +531,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
par.match->destroy(&par);
if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
- module_put(match->me);
+ module_put(me);
}
static void
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index e82d9a966c45..974525eb92df 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -214,7 +214,9 @@ static int __init nft_flow_offload_module_init(void)
{
int err;
- register_netdevice_notifier(&flow_offload_netdev_notifier);
+ err = register_netdevice_notifier(&flow_offload_netdev_notifier);
+ if (err)
+ goto err;
err = nft_register_expr(&nft_flow_offload_type);
if (err < 0)
@@ -224,6 +226,7 @@ static int __init nft_flow_offload_module_init(void)
register_expr:
unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+err:
return err;
}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6180626c3f80..6df486c5ebd3 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -229,7 +229,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
}
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
- nft_reg_store8(dest, !!skb->sp);
+ nft_reg_store8(dest, secpath_exists(skb));
break;
#endif
#ifdef CONFIG_NF_TABLES_BRIDGE
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 5322609f7662..b08865ec5ed3 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -161,7 +161,7 @@ static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- const struct sec_path *sp = pkt->skb->sp;
+ const struct sec_path *sp = skb_sec_path(pkt->skb);
const struct xfrm_state *state;
if (sp == NULL || sp->len <= priv->spnum) {
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dec843cadf46..9e05c86ba5c4 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -201,18 +201,8 @@ static __net_init int xt_rateest_net_init(struct net *net)
return 0;
}
-static void __net_exit xt_rateest_net_exit(struct net *net)
-{
- struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
- int i;
-
- for (i = 0; i < ARRAY_SIZE(xn->hash); i++)
- WARN_ON_ONCE(!hlist_empty(&xn->hash[i]));
-}
-
static struct pernet_operations xt_rateest_net_ops = {
.init = xt_rateest_net_init,
- .exit = xt_rateest_net_exit,
.id = &xt_rateest_id,
.size = sizeof(struct xt_rateest_net),
};
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 3e7d259e5d8d..28e27a32d9b9 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -260,7 +260,7 @@ static inline void
dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
{
hlist_del_rcu(&ent->node);
- call_rcu_bh(&ent->rcu, dsthash_free_rcu);
+ call_rcu(&ent->rcu, dsthash_free_rcu);
ht->count--;
}
static void htable_gc(struct work_struct *work);
@@ -295,9 +295,10 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
/* copy match config into hashtable config */
ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
-
- if (ret)
+ if (ret) {
+ vfree(hinfo);
return ret;
+ }
hinfo->cfg.size = size;
if (hinfo->cfg.max == 0)
@@ -814,7 +815,6 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
int ret;
ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
-
if (ret)
return ret;
@@ -830,7 +830,6 @@ hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
int ret;
ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
-
if (ret)
return ret;
@@ -921,7 +920,6 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
return ret;
ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
-
if (ret)
return ret;
@@ -940,7 +938,6 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
return ret;
ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
-
if (ret)
return ret;
@@ -1329,7 +1326,7 @@ static void __exit hashlimit_mt_exit(void)
xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
unregister_pernet_subsys(&hashlimit_net_ops);
- rcu_barrier_bh();
+ rcu_barrier();
kmem_cache_destroy(hashlimit_cachep);
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 9d6d67b953ac..4034d70bff39 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -33,7 +33,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* Not a bridged IP packet or no info available yet:
* LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
* the destination device will be a bridge. */
- if (!skb->nf_bridge) {
+ if (!nf_bridge_info_exists(skb)) {
/* Return MATCH if the invert flags of the used options are on */
if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
!(info->invert & XT_PHYSDEV_OP_BRIDGED))
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 13f8ccf946d6..aa84e8121c93 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -56,7 +56,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info,
unsigned short family)
{
const struct xt_policy_elem *e;
- const struct sec_path *sp = skb->sp;
+ const struct sec_path *sp = skb_sec_path(skb);
int strict = info->flags & XT_POLICY_MATCH_STRICT;
int i, pos;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6bb9f3cde0b0..3c023d6120f6 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1706,7 +1706,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
nlk->flags &= ~NETLINK_F_EXT_ACK;
err = 0;
break;
- case NETLINK_DUMP_STRICT_CHK:
+ case NETLINK_GET_STRICT_CHK:
if (val)
nlk->flags |= NETLINK_F_STRICT_CHK;
else
@@ -1806,7 +1806,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
err = 0;
break;
- case NETLINK_DUMP_STRICT_CHK:
+ case NETLINK_GET_STRICT_CHK:
if (len < sizeof(int))
return -EINVAL;
len = sizeof(int);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 85ae53d8fd09..e47ebbbe71b8 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -301,7 +301,7 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
key->eth.vlan.tpid = vlan->vlan_tpid;
}
return skb_vlan_push(skb, vlan->vlan_tpid,
- ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+ ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK);
}
/* 'src' is already properly masked. */
@@ -822,8 +822,10 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
__skb_dst_copy(skb, data->dst);
*OVS_CB(skb) = data->cb;
skb->inner_protocol = data->inner_protocol;
- skb->vlan_tci = data->vlan_tci;
- skb->vlan_proto = data->vlan_proto;
+ if (data->vlan_tci & VLAN_CFI_MASK)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK);
+ else
+ __vlan_hwaccel_clear_tag(skb);
/* Reconstruct the MAC header. */
skb_push(skb, data->l2_len);
@@ -867,7 +869,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
data->network_offset = orig_network_offset;
- data->vlan_tci = skb->vlan_tci;
+ if (skb_vlan_tag_present(skb))
+ data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK;
+ else
+ data->vlan_tci = 0;
data->vlan_proto = skb->vlan_proto;
data->mac_proto = mac_proto;
data->l2_len = hlen;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index a4660c48ff01..cd94f925495a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1166,7 +1166,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
if (err) {
net_warn_ratelimited("openvswitch: zone: %u "
- "execeeds conntrack limit\n",
+ "exceeds conntrack limit\n",
info->zone.id);
return err;
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 35966da84769..57e07768c9d1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -325,7 +325,7 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
return -ENOMEM;
vh = (struct vlan_head *)skb->data;
- key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
+ key_vh->tci = vh->tci | htons(VLAN_CFI_MASK);
key_vh->tpid = vh->tpid;
if (unlikely(untag_vlan)) {
@@ -358,7 +358,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
int res;
if (skb_vlan_tag_present(skb)) {
- key->eth.vlan.tci = htons(skb->vlan_tci);
+ key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK);
key->eth.vlan.tpid = skb->vlan_proto;
} else {
/* Parse outer vlan tag in the non-accelerated case. */
@@ -597,7 +597,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
* skb_vlan_pop(), which will later shift the ethertype into
* skb->protocol.
*/
- if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
+ if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
skb->protocol = key->eth.cvlan.tpid;
else
skb->protocol = key->eth.type;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8b7..ba01fc4270bd 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -60,7 +60,7 @@ struct ovs_tunnel_info {
struct vlan_head {
__be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/
- __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */
};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 865ecef68196..435a4bdf8f89 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -990,9 +990,9 @@ static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
if (a[OVS_KEY_ATTR_VLAN])
tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (!(tci & htons(VLAN_CFI_MASK))) {
if (tci) {
- OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
+ OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.",
(inner) ? "C-VLAN" : "VLAN");
return -EINVAL;
} else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
@@ -1013,9 +1013,9 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
__be16 tci = 0;
__be16 tpid = 0;
bool encap_valid = !!(match->key->eth.vlan.tci &
- htons(VLAN_TAG_PRESENT));
+ htons(VLAN_CFI_MASK));
bool i_encap_valid = !!(match->key->eth.cvlan.tci &
- htons(VLAN_TAG_PRESENT));
+ htons(VLAN_CFI_MASK));
if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
/* Not a VLAN. */
@@ -1039,8 +1039,8 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
(inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
return -EINVAL;
}
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.",
+ if (!(tci & htons(VLAN_CFI_MASK))) {
+ OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.",
(inner) ? "C-VLAN" : "VLAN");
return -EINVAL;
}
@@ -1095,7 +1095,7 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
if (err)
return err;
- encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT));
+ encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK));
if (encap_valid) {
err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
is_mask, log);
@@ -2943,7 +2943,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
vlan = nla_data(a);
if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK)))
return -EINVAL;
vlan_tci = vlan->vlan_tci;
break;
@@ -2959,7 +2959,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
/* Prohibit push MPLS other than to a white list
* for packets that have a known tag order.
*/
- if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ if (vlan_tci & htons(VLAN_CFI_MASK) ||
(eth_type != htons(ETH_P_IP) &&
eth_type != htons(ETH_P_IPV6) &&
eth_type != htons(ETH_P_ARP) &&
@@ -2971,7 +2971,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
}
case OVS_ACTION_ATTR_POP_MPLS:
- if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ if (vlan_tci & htons(VLAN_CFI_MASK) ||
!eth_p_mpls(eth_type))
return -EINVAL;
@@ -3036,7 +3036,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_POP_ETH:
if (mac_proto != MAC_PROTO_ETHERNET)
return -EINVAL;
- if (vlan_tci & htons(VLAN_TAG_PRESENT))
+ if (vlan_tci & htons(VLAN_CFI_MASK))
return -EINVAL;
mac_proto = MAC_PROTO_NONE;
break;
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 5aaf3babfc3f..acb6077b7478 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -93,7 +93,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- err = dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
rtnl_delete_link(dev);
rtnl_unlock();
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 0e72d95b0e8f..c38a62464b85 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -68,7 +68,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- err = dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
rtnl_delete_link(dev);
rtnl_unlock();
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 2e5e7a41d8ef..9bec22e3e9e8 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp)
struct vport *local;
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
- BUG_ON(!local);
return local->dev;
}
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 7e6301b2ec4d..8f16f11f7ad3 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -131,7 +131,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- err = dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
rtnl_delete_link(dev);
rtnl_unlock();
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a74650e98f42..eedacdebcd4c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1965,7 +1965,7 @@ retry:
skb->mark = sk->sk_mark;
skb->tstamp = sockc.transmit_time;
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
@@ -2460,7 +2460,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
skb->tstamp = sockc->transmit_time;
- sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
skb_zcopy_set_nouarg(skb, ph.raw);
skb_reserve(skb, hlen);
@@ -2625,8 +2625,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
sll_addr)))
goto out;
proto = saddr->sll_protocol;
- addr = saddr->sll_addr;
+ addr = saddr->sll_halen ? saddr->sll_addr : NULL;
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
+ if (addr && dev && saddr->sll_halen < dev->addr_len)
+ goto out;
}
err = -ENXIO;
@@ -2823,8 +2825,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
goto out;
proto = saddr->sll_protocol;
- addr = saddr->sll_addr;
+ addr = saddr->sll_halen ? saddr->sll_addr : NULL;
dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
+ if (addr && dev && saddr->sll_halen < dev->addr_len)
+ goto out;
}
err = -ENXIO;
@@ -2898,7 +2902,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
goto out_free;
}
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
!packet_extra_vlan_len_allowed(dev, skb)) {
diff --git a/net/rds/message.c b/net/rds/message.c
index 4b00b1152a5f..f139420ba1f6 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -308,16 +308,27 @@ out:
/*
* RDS ops use this to grab SG entries from the rm's sg pool.
*/
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents,
+ int *ret)
{
struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
struct scatterlist *sg_ret;
- WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
- WARN_ON(!nents);
+ if (WARN_ON(!ret))
+ return NULL;
- if (rm->m_used_sgs + nents > rm->m_total_sgs)
+ if (nents <= 0) {
+ pr_warn("rds: alloc sgs failed! nents <= 0\n");
+ *ret = -EINVAL;
return NULL;
+ }
+
+ if (rm->m_used_sgs + nents > rm->m_total_sgs) {
+ pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n",
+ rm->m_total_sgs, rm->m_used_sgs, nents);
+ *ret = -ENOMEM;
+ return NULL;
+ }
sg_ret = &sg_first[rm->m_used_sgs];
sg_init_table(sg_ret, nents);
@@ -332,6 +343,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
unsigned int i;
int num_sgs = ceil(total_len, PAGE_SIZE);
int extra_bytes = num_sgs * sizeof(struct scatterlist);
+ int ret;
rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
if (!rm)
@@ -340,10 +352,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
rm->data.op_nents = ceil(total_len, PAGE_SIZE);
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret);
if (!rm->data.op_sg) {
rds_message_put(rm);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
for (i = 0; i < rm->data.op_nents; ++i) {
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 98237feb607a..182ab8430594 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -517,9 +517,10 @@ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
return tot_pages;
}
-int rds_rdma_extra_size(struct rds_rdma_args *args)
+int rds_rdma_extra_size(struct rds_rdma_args *args,
+ struct rds_iov_vector *iov)
{
- struct rds_iovec vec;
+ struct rds_iovec *vec;
struct rds_iovec __user *local_vec;
int tot_pages = 0;
unsigned int nr_pages;
@@ -530,13 +531,23 @@ int rds_rdma_extra_size(struct rds_rdma_args *args)
if (args->nr_local == 0)
return -EINVAL;
+ iov->iov = kcalloc(args->nr_local,
+ sizeof(struct rds_iovec),
+ GFP_KERNEL);
+ if (!iov->iov)
+ return -ENOMEM;
+
+ vec = &iov->iov[0];
+
+ if (copy_from_user(vec, local_vec, args->nr_local *
+ sizeof(struct rds_iovec)))
+ return -EFAULT;
+ iov->len = args->nr_local;
+
/* figure out the number of pages in the vector */
- for (i = 0; i < args->nr_local; i++) {
- if (copy_from_user(&vec, &local_vec[i],
- sizeof(struct rds_iovec)))
- return -EFAULT;
+ for (i = 0; i < args->nr_local; i++, vec++) {
- nr_pages = rds_pages_in_vec(&vec);
+ nr_pages = rds_pages_in_vec(vec);
if (nr_pages == 0)
return -EINVAL;
@@ -558,15 +569,15 @@ int rds_rdma_extra_size(struct rds_rdma_args *args)
* Extract all arguments and set up the rdma_op
*/
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
+ struct cmsghdr *cmsg,
+ struct rds_iov_vector *vec)
{
struct rds_rdma_args *args;
struct rm_rdma_op *op = &rm->rdma;
int nr_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
- struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
- int iov_size;
+ struct rds_iovec *iovs;
unsigned int i, j;
int ret = 0;
@@ -586,31 +597,23 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
goto out_ret;
}
- /* Check whether to allocate the iovec area */
- iov_size = args->nr_local * sizeof(struct rds_iovec);
- if (args->nr_local > UIO_FASTIOV) {
- iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
- if (!iovs) {
- ret = -ENOMEM;
- goto out_ret;
- }
+ if (vec->len != args->nr_local) {
+ ret = -EINVAL;
+ goto out_ret;
}
- if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
- ret = -EFAULT;
- goto out;
- }
+ iovs = vec->iov;
nr_pages = rds_rdma_pages(iovs, args->nr_local);
if (nr_pages < 0) {
ret = -EINVAL;
- goto out;
+ goto out_ret;
}
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
- goto out;
+ goto out_ret;
}
op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
@@ -620,11 +623,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
WARN_ON(!nr_pages);
- op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
- if (!op->op_sg) {
- ret = -ENOMEM;
- goto out;
- }
+ op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
+ if (!op->op_sg)
+ goto out_pages;
if (op->op_notify || op->op_recverr) {
/* We allocate an uninitialized notifier here, because
@@ -635,7 +636,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
if (!op->op_notifier) {
ret = -ENOMEM;
- goto out;
+ goto out_pages;
}
op->op_notifier->n_user_token = args->user_token;
op->op_notifier->n_status = RDS_RDMA_SUCCESS;
@@ -681,7 +682,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
*/
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
if (ret < 0)
- goto out;
+ goto out_pages;
else
ret = 0;
@@ -714,13 +715,11 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
nr_bytes,
(unsigned int) args->remote_vec.bytes);
ret = -EINVAL;
- goto out;
+ goto out_pages;
}
op->op_bytes = nr_bytes;
-out:
- if (iovs != iovstack)
- sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
+out_pages:
kfree(pages);
out_ret:
if (ret)
@@ -838,11 +837,9 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
rm->atomic.op_active = 1;
rm->atomic.op_recverr = rs->rs_recverr;
- rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
- if (!rm->atomic.op_sg) {
- ret = -ENOMEM;
+ rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1, &ret);
+ if (!rm->atomic.op_sg)
goto err;
- }
/* verify 8 byte-aligned */
if (args->local_addr & 0x7) {
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 6bfaf05b63b2..02ec4a3b2799 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -386,6 +386,18 @@ static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
INIT_LIST_HEAD(&q->zcookie_head);
}
+struct rds_iov_vector {
+ struct rds_iovec *iov;
+ int len;
+};
+
+struct rds_iov_vector_arr {
+ struct rds_iov_vector *vec;
+ int len;
+ int indx;
+ int incr;
+};
+
struct rds_message {
refcount_t m_refcount;
struct list_head m_sock_item;
@@ -827,7 +839,8 @@ rds_conn_connecting(struct rds_connection *conn)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents,
+ int *ret);
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
bool zcopy);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
@@ -904,13 +917,13 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_rdma_extra_size(struct rds_rdma_args *args);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
+int rds_rdma_extra_size(struct rds_rdma_args *args,
+ struct rds_iov_vector *iov);
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
+ struct cmsghdr *cmsg,
+ struct rds_iov_vector *vec);
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
void rds_rdma_free_op(struct rm_rdma_op *ro);
diff --git a/net/rds/send.c b/net/rds/send.c
index fe785ee819dd..3d822bad7de9 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -876,13 +876,18 @@ out:
* rds_message is getting to be quite complicated, and we'd like to allocate
* it all in one go. This figures out how big it needs to be up front.
*/
-static int rds_rm_size(struct msghdr *msg, int num_sgs)
+static int rds_rm_size(struct msghdr *msg, int num_sgs,
+ struct rds_iov_vector_arr *vct)
{
struct cmsghdr *cmsg;
int size = 0;
int cmsg_groups = 0;
int retval;
bool zcopy_cookie = false;
+ struct rds_iov_vector *iov, *tmp_iov;
+
+ if (num_sgs < 0)
+ return -EINVAL;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@@ -893,8 +898,24 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs)
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
+ if (vct->indx >= vct->len) {
+ vct->len += vct->incr;
+ tmp_iov =
+ krealloc(vct->vec,
+ vct->len *
+ sizeof(struct rds_iov_vector),
+ GFP_KERNEL);
+ if (!tmp_iov) {
+ vct->len -= vct->incr;
+ return -ENOMEM;
+ }
+ vct->vec = tmp_iov;
+ }
+ iov = &vct->vec[vct->indx];
+ memset(iov, 0, sizeof(struct rds_iov_vector));
+ vct->indx++;
cmsg_groups |= 1;
- retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+ retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov);
if (retval < 0)
return retval;
size += retval;
@@ -951,10 +972,11 @@ static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
}
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
- struct msghdr *msg, int *allocated_mr)
+ struct msghdr *msg, int *allocated_mr,
+ struct rds_iov_vector_arr *vct)
{
struct cmsghdr *cmsg;
- int ret = 0;
+ int ret = 0, ind = 0;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@@ -968,7 +990,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
*/
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
- ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+ if (ind >= vct->indx)
+ return -ENOMEM;
+ ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]);
+ ind++;
break;
case RDS_CMSG_RDMA_DEST:
@@ -1084,6 +1109,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
int num_sgs = ceil(payload_len, PAGE_SIZE);
int namelen;
+ struct rds_iov_vector_arr vct;
+ int ind;
+
+ memset(&vct, 0, sizeof(vct));
+
+ /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */
+ vct.incr = 1;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1220,7 +1252,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
}
/* size of rm including all sgs */
- ret = rds_rm_size(msg, num_sgs);
+ ret = rds_rm_size(msg, num_sgs, &vct);
if (ret < 0)
goto out;
@@ -1232,11 +1264,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* Attach data to the rm */
if (payload_len) {
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
- if (!rm->data.op_sg) {
- ret = -ENOMEM;
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret);
+ if (!rm->data.op_sg)
goto out;
- }
ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
if (ret)
goto out;
@@ -1270,7 +1300,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
rm->m_conn_path = cpath;
/* Parse any control messages the user may have included. */
- ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct);
if (ret) {
/* Trigger connection so that its ready for the next retry */
if (ret == -EAGAIN)
@@ -1348,9 +1378,18 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (ret)
goto out;
rds_message_put(rm);
+
+ for (ind = 0; ind < vct.indx; ind++)
+ kfree(vct.vec[ind].iov);
+ kfree(vct.vec);
+
return payload_len;
out:
+ for (ind = 0; ind < vct.indx; ind++)
+ kfree(vct.vec[ind].iov);
+ kfree(vct.vec);
+
/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
* or in any other way, we need to destroy the MR again */
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 0f8465852254..41a5cd4b5c0e 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -16,7 +16,6 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <linux/gpio.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9c1b0729aebf..d4b8355737d8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -21,8 +21,6 @@
#include <linux/kmod.h>
#include <linux/err.h>
#include <linux/module.h>
-#include <linux/rhashtable.h>
-#include <linux/list.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/sch_generic.h>
@@ -1522,227 +1520,8 @@ out_module_put:
return skb->len;
}
-struct tcf_action_net {
- struct rhashtable egdev_ht;
-};
-
-static unsigned int tcf_action_net_id;
-
-struct tcf_action_egdev_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_priv;
-};
-
-struct tcf_action_egdev {
- struct rhash_head ht_node;
- const struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
-};
-
-static const struct rhashtable_params tcf_action_egdev_ht_params = {
- .key_offset = offsetof(struct tcf_action_egdev, dev),
- .head_offset = offsetof(struct tcf_action_egdev, ht_node),
- .key_len = sizeof(const struct net_device *),
-};
-
-static struct tcf_action_egdev *
-tcf_action_egdev_lookup(const struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
- tcf_action_egdev_ht_params);
-}
-
-static struct tcf_action_egdev *
-tcf_action_egdev_get(const struct net_device *dev)
-{
- struct tcf_action_egdev *egdev;
- struct tcf_action_net *tan;
-
- egdev = tcf_action_egdev_lookup(dev);
- if (egdev)
- goto inc_ref;
-
- egdev = kzalloc(sizeof(*egdev), GFP_KERNEL);
- if (!egdev)
- return NULL;
- INIT_LIST_HEAD(&egdev->cb_list);
- egdev->dev = dev;
- tan = net_generic(dev_net(dev), tcf_action_net_id);
- rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node,
- tcf_action_egdev_ht_params);
-
-inc_ref:
- egdev->refcnt++;
- return egdev;
-}
-
-static void tcf_action_egdev_put(struct tcf_action_egdev *egdev)
-{
- struct tcf_action_net *tan;
-
- if (--egdev->refcnt)
- return;
- tan = net_generic(dev_net(egdev->dev), tcf_action_net_id);
- rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node,
- tcf_action_egdev_ht_params);
- kfree(egdev);
-}
-
-static struct tcf_action_egdev_cb *
-tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- list_for_each_entry(egdev_cb, &egdev->cb_list, list)
- if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv)
- return egdev_cb;
- return NULL;
-}
-
-static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev,
- enum tc_setup_type type,
- void *type_data, bool err_stop)
-{
- struct tcf_action_egdev_cb *egdev_cb;
- int ok_count = 0;
- int err;
-
- list_for_each_entry(egdev_cb, &egdev->cb_list, list) {
- err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv);
- if (err) {
- if (err_stop)
- return err;
- } else {
- ok_count++;
- }
- }
- return ok_count;
-}
-
-static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
- if (WARN_ON(egdev_cb))
- return -EEXIST;
- egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL);
- if (!egdev_cb)
- return -ENOMEM;
- egdev_cb->cb = cb;
- egdev_cb->cb_priv = cb_priv;
- list_add(&egdev_cb->list, &egdev->cb_list);
- return 0;
-}
-
-static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
- if (WARN_ON(!egdev_cb))
- return;
- list_del(&egdev_cb->list);
- kfree(egdev_cb);
-}
-
-static int __tc_setup_cb_egdev_register(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev);
- int err;
-
- if (!egdev)
- return -ENOMEM;
- err = tcf_action_egdev_cb_add(egdev, cb, cb_priv);
- if (err)
- goto err_cb_add;
- return 0;
-
-err_cb_add:
- tcf_action_egdev_put(egdev);
- return err;
-}
-int tc_setup_cb_egdev_register(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- int err;
-
- rtnl_lock();
- err = __tc_setup_cb_egdev_register(dev, cb, cb_priv);
- rtnl_unlock();
- return err;
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register);
-
-static void __tc_setup_cb_egdev_unregister(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
- if (WARN_ON(!egdev))
- return;
- tcf_action_egdev_cb_del(egdev, cb, cb_priv);
- tcf_action_egdev_put(egdev);
-}
-void tc_setup_cb_egdev_unregister(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- rtnl_lock();
- __tc_setup_cb_egdev_unregister(dev, cb, cb_priv);
- rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister);
-
-int tc_setup_cb_egdev_call(const struct net_device *dev,
- enum tc_setup_type type, void *type_data,
- bool err_stop)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
- if (!egdev)
- return 0;
- return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop);
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call);
-
-static __net_init int tcf_action_net_init(struct net *net)
-{
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params);
-}
-
-static void __net_exit tcf_action_net_exit(struct net *net)
-{
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- rhashtable_destroy(&tan->egdev_ht);
-}
-
-static struct pernet_operations tcf_action_net_ops = {
- .init = tcf_action_net_init,
- .exit = tcf_action_net_exit,
- .id = &tcf_action_net_id,
- .size = sizeof(struct tcf_action_net),
-};
-
static int __init tc_action_init(void)
{
- int err;
-
- err = register_pernet_subsys(&tcf_action_net_ops);
- if (err)
- return err;
-
rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 37c9b8f0e10f..ec8ec55e0fe8 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -85,7 +85,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
int ovr, int bind, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- int ret = 0, err;
+ int ret = 0, tcfp_result = TC_ACT_OK, err, size;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
@@ -93,7 +93,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
struct tc_action_net *tn = net_generic(net, police_net_id);
struct tcf_police_params *new;
bool exists = false;
- int size;
if (nla == NULL)
return -EINVAL;
@@ -160,6 +159,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
goto failure;
}
+ if (tb[TCA_POLICE_RESULT]) {
+ tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+ if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) {
+ NL_SET_ERR_MSG(extack,
+ "goto chain not allowed on fallback");
+ err = -EINVAL;
+ goto failure;
+ }
+ }
+
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (unlikely(!new)) {
err = -ENOMEM;
@@ -167,6 +176,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
/* No failure allowed after this point */
+ new->tcfp_result = tcfp_result;
new->tcfp_mtu = parm->mtu;
if (!new->tcfp_mtu) {
new->tcfp_mtu = ~0;
@@ -196,16 +206,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (tb[TCA_POLICE_AVRATE])
new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
- if (tb[TCA_POLICE_RESULT]) {
- new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
- if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) {
- NL_SET_ERR_MSG(extack,
- "goto chain not allowed on fallback");
- err = -EINVAL;
- goto failure;
- }
- }
-
spin_lock_bh(&police->tcf_lock);
spin_lock_bh(&police->tcfp_lock);
police->tcfp_t_c = ktime_get_ns();
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 4cca8f274662..c3b90fadaff6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -210,9 +210,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct tcf_tunnel_key *t;
bool exists = false;
__be16 dst_port = 0;
+ __be64 key_id = 0;
int opts_len = 0;
- __be64 key_id;
- __be16 flags;
+ __be16 flags = 0;
u8 tos, ttl;
int ret = 0;
int err;
@@ -246,15 +246,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
case TCA_TUNNEL_KEY_ACT_RELEASE:
break;
case TCA_TUNNEL_KEY_ACT_SET:
- if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
- NL_SET_ERR_MSG(extack, "Missing tunnel key id");
- ret = -EINVAL;
- goto err_out;
- }
+ if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ __be32 key32;
- key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+ key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
+ key_id = key32_to_tunnel_id(key32);
+ flags = TUNNEL_KEY;
+ }
- flags = TUNNEL_KEY | TUNNEL_CSUM;
+ flags |= TUNNEL_CSUM;
if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
flags &= ~TUNNEL_CSUM;
@@ -508,10 +508,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
struct ip_tunnel_key *key = &info->key;
__be32 key_id = tunnel_id_to_key32(key->tun_id);
- if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+ if (((key->tun_flags & TUNNEL_KEY) &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
tunnel_key_dump_addresses(skb,
&params->tcft_enc_metadata->u.tun_info) ||
- nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
+ (key->tp_dst &&
+ nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT,
+ key->tp_dst)) ||
nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
!(key->tun_flags & TUNNEL_CSUM)) ||
tunnel_key_opts_dump(skb, info))
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ba677d54a7af..93fdaf707313 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
/* extract existing tag (and guarantee no hw-accel tag) */
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
/* in-payload vlan tag, pop it */
err = __skb_vlan_pop(skb, &tci);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f427a1e00e7e..8ce2a0507970 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/rhashtable.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain)
}
}
+static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
+{
+ const struct Qdisc_class_ops *cops;
+ struct Qdisc *qdisc;
+
+ if (!dev_ingress_queue(dev))
+ return NULL;
+
+ qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+ if (!qdisc)
+ return NULL;
+
+ cops = qdisc->ops->cl_ops;
+ if (!cops)
+ return NULL;
+
+ if (!cops->tcf_block)
+ return NULL;
+
+ return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
+}
+
+static struct rhashtable indr_setup_block_ht;
+
+struct tc_indr_block_dev {
+ struct rhash_head ht_node;
+ struct net_device *dev;
+ unsigned int refcnt;
+ struct list_head cb_list;
+ struct tcf_block *block;
+};
+
+struct tc_indr_block_cb {
+ struct list_head list;
+ void *cb_priv;
+ tc_indr_block_bind_cb_t *cb;
+ void *cb_ident;
+};
+
+static const struct rhashtable_params tc_indr_setup_block_ht_params = {
+ .key_offset = offsetof(struct tc_indr_block_dev, dev),
+ .head_offset = offsetof(struct tc_indr_block_dev, ht_node),
+ .key_len = sizeof(struct net_device *),
+};
+
+static struct tc_indr_block_dev *
+tc_indr_block_dev_lookup(struct net_device *dev)
+{
+ return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+ tc_indr_setup_block_ht_params);
+}
+
+static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
+{
+ struct tc_indr_block_dev *indr_dev;
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (indr_dev)
+ goto inc_ref;
+
+ indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+ if (!indr_dev)
+ return NULL;
+
+ INIT_LIST_HEAD(&indr_dev->cb_list);
+ indr_dev->dev = dev;
+ indr_dev->block = tc_dev_ingress_block(dev);
+ if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ tc_indr_setup_block_ht_params)) {
+ kfree(indr_dev);
+ return NULL;
+ }
+
+inc_ref:
+ indr_dev->refcnt++;
+ return indr_dev;
+}
+
+static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
+{
+ if (--indr_dev->refcnt)
+ return;
+
+ rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ tc_indr_setup_block_ht_params);
+ kfree(indr_dev);
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ if (indr_block_cb->cb == cb &&
+ indr_block_cb->cb_ident == cb_ident)
+ return indr_block_cb;
+ return NULL;
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+
+ indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (indr_block_cb)
+ return ERR_PTR(-EEXIST);
+
+ indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+ if (!indr_block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ indr_block_cb->cb_priv = cb_priv;
+ indr_block_cb->cb = cb;
+ indr_block_cb->cb_ident = cb_ident;
+ list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+ return indr_block_cb;
+}
+
+static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
+{
+ list_del(&indr_block_cb->list);
+ kfree(indr_block_cb);
+}
+
+static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
+ struct tc_indr_block_cb *indr_block_cb,
+ enum tc_block_command command)
+{
+ struct tc_block_offload bo = {
+ .command = command,
+ .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+ .block = indr_dev->block,
+ };
+
+ if (!indr_dev->block)
+ return;
+
+ indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+ &bo);
+}
+
+int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+ int err;
+
+ indr_dev = tc_indr_block_dev_get(dev);
+ if (!indr_dev)
+ return -ENOMEM;
+
+ indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+ err = PTR_ERR_OR_ZERO(indr_block_cb);
+ if (err)
+ goto err_dev_put;
+
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+ return 0;
+
+err_dev_put:
+ tc_indr_block_dev_put(indr_dev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
+
+int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ int err;
+
+ rtnl_lock();
+ err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+ rtnl_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
+
+void __tc_indr_block_cb_unregister(struct net_device *dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (!indr_block_cb)
+ return;
+
+ /* Send unbind message if required to free any block cbs. */
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+ tc_indr_block_cb_del(indr_block_cb);
+ tc_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
+
+void tc_indr_block_cb_unregister(struct net_device *dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ rtnl_lock();
+ __tc_indr_block_cb_unregister(dev, cb, cb_ident);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
+
+static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
+ struct tcf_block_ext_info *ei,
+ enum tc_block_command command,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+ struct tc_block_offload bo = {
+ .command = command,
+ .binder_type = ei->binder_type,
+ .block = block,
+ .extack = extack,
+ };
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+ &bo);
+}
+
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
return block->offloadcnt;
@@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
- return err;
+ if (err)
+ return err;
+
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ return 0;
no_offload_dev_inc:
if (tcf_block_offload_in_use(block))
return -EOPNOTSUPP;
block->nooffloaddevcnt++;
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
return 0;
}
@@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
@@ -1023,29 +1270,6 @@ void tcf_block_cb_unregister(struct tcf_block *block,
}
EXPORT_SYMBOL(tcf_block_cb_unregister);
-static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
- void *type_data, bool err_stop)
-{
- struct tcf_block_cb *block_cb;
- int ok_count = 0;
- int err;
-
- /* Make sure all netdevs sharing this block are offload-capable. */
- if (block->nooffloaddevcnt && err_stop)
- return -EOPNOTSUPP;
-
- list_for_each_entry(block_cb, &block->cb_list, list) {
- err = block_cb->cb(type, type_data, block_cb->cb_priv);
- if (err) {
- if (err_stop)
- return err;
- } else {
- ok_count++;
- }
- }
- return ok_count;
-}
-
/* Main classifier routine: scans classifier chain attached
* to this qdisc, (optionally) tests for protocol and asks
* specific classifiers.
@@ -2268,54 +2492,26 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
-static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
- enum tc_setup_type type,
- void *type_data, bool err_stop)
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
{
+ struct tcf_block_cb *block_cb;
int ok_count = 0;
-#ifdef CONFIG_NET_CLS_ACT
- const struct tc_action *a;
- struct net_device *dev;
- int i, ret;
+ int err;
- if (!tcf_exts_has_actions(exts))
- return 0;
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop)
+ return -EOPNOTSUPP;
- for (i = 0; i < exts->nr_actions; i++) {
- a = exts->actions[i];
- if (!a->ops->get_dev)
- continue;
- dev = a->ops->get_dev(a);
- if (!dev)
- continue;
- ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
- a->ops->put_dev(dev);
- if (ret < 0)
- return ret;
- ok_count += ret;
+ list_for_each_entry(block_cb, &block->cb_list, list) {
+ err = block_cb->cb(type, type_data, block_cb->cb_priv);
+ if (err) {
+ if (err_stop)
+ return err;
+ } else {
+ ok_count++;
+ }
}
-#endif
- return ok_count;
-}
-
-int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
- enum tc_setup_type type, void *type_data, bool err_stop)
-{
- int ok_count;
- int ret;
-
- ret = tcf_block_cb_call(block, type, type_data, err_stop);
- if (ret < 0)
- return ret;
- ok_count = ret;
-
- if (!exts || ok_count)
- return ok_count;
- ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
- if (ret < 0)
- return ret;
- ok_count += ret;
-
return ok_count;
}
EXPORT_SYMBOL(tc_setup_cb_call);
@@ -2355,6 +2551,11 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
+ err = rhashtable_init(&indr_setup_block_ht,
+ &tc_indr_setup_block_ht_params);
+ if (err)
+ goto err_rhash_setup_block_ht;
+
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
@@ -2366,6 +2567,8 @@ static int __init tc_filter_init(void)
return 0;
+err_rhash_setup_block_ht:
+ unregister_pernet_subsys(&tcf_net_ops);
err_register_pernet_subsys:
destroy_workqueue(tc_filter_wq);
return err;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fa6fe2fe0f32..a95cb240a606 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -169,7 +169,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
if (oldprog)
tcf_block_offload_dec(block, &oldprog->gen_flags);
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
if (prog) {
if (err < 0) {
cls_bpf_offload_cmd(tp, oldprog, prog, extack);
@@ -234,7 +234,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
}
static int cls_bpf_init(struct tcf_proto *tp)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c6c327874abc..dad04e710493 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,8 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+ struct flow_dissector_key_ports tp_min;
+ struct flow_dissector_key_ports tp_max;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -65,6 +67,7 @@ struct fl_flow_mask_range {
struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
+ u32 flags;
struct rhash_head ht_node;
struct rhashtable ht;
struct rhashtable_params filter_ht_params;
@@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
}
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
- struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+ __be16 min_mask, max_mask, min_val, max_val;
+
+ min_mask = htons(filter->mask->key.tp_min.dst);
+ max_mask = htons(filter->mask->key.tp_max.dst);
+ min_val = htons(filter->key.tp_min.dst);
+ max_val = htons(filter->key.tp_max.dst);
+
+ if (min_mask && max_mask) {
+ if (htons(key->tp.dst) < min_val ||
+ htons(key->tp.dst) > max_val)
+ return false;
+
+ /* skb does not have min and max values */
+ mkey->tp_min.dst = filter->mkey.tp_min.dst;
+ mkey->tp_max.dst = filter->mkey.tp_max.dst;
+ }
+ return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+ __be16 min_mask, max_mask, min_val, max_val;
+
+ min_mask = htons(filter->mask->key.tp_min.src);
+ max_mask = htons(filter->mask->key.tp_max.src);
+ min_val = htons(filter->key.tp_min.src);
+ max_val = htons(filter->key.tp_max.src);
+
+ if (min_mask && max_mask) {
+ if (htons(key->tp.src) < min_val ||
+ htons(key->tp.src) > max_val)
+ return false;
+
+ /* skb does not have min and max values */
+ mkey->tp_min.src = filter->mkey.tp_min.src;
+ mkey->tp_max.src = filter->mkey.tp_max.src;
+ }
+ return true;
+}
+
+static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey)
{
return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
mask->filter_ht_params);
}
+static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey,
+ struct fl_flow_key *key)
+{
+ struct cls_fl_filter *filter, *f;
+
+ list_for_each_entry_rcu(filter, &mask->filters, list) {
+ if (!fl_range_port_dst_cmp(filter, key, mkey))
+ continue;
+
+ if (!fl_range_port_src_cmp(filter, key, mkey))
+ continue;
+
+ f = __fl_lookup(mask, mkey);
+ if (f)
+ return f;
+ }
+ return NULL;
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey,
+ struct fl_flow_key *key)
+{
+ if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+ return fl_lookup_range(mask, mkey, key);
+
+ return __fl_lookup(mask, mkey);
+}
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
@@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
fl_set_masked_key(&skb_mkey, &skb_key, mask);
- f = fl_lookup(mask, &skb_mkey);
+ f = fl_lookup(mask, &skb_mkey, &skb_key);
if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
@@ -289,8 +368,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
cls_flower.command = TC_CLSFLOWER_DESTROY;
cls_flower.cookie = (unsigned long) f;
- tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
tcf_block_offload_dec(block, &f->flags);
}
@@ -312,8 +390,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
- err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
if (err < 0) {
fl_hw_destroy_filter(tp, f, NULL);
return err;
@@ -339,8 +416,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
- tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
}
static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
@@ -514,6 +590,31 @@ static void fl_set_key_val(struct nlattr **tb,
memcpy(mask, nla_data(tb[mask_type]), len);
}
+static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ fl_set_key_val(tb, &key->tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
+ fl_set_key_val(tb, &key->tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
+ fl_set_key_val(tb, &key->tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
+ fl_set_key_val(tb, &key->tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
+
+ if ((mask->tp_min.dst && mask->tp_max.dst &&
+ htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
+ (mask->tp_min.src && mask->tp_max.src &&
+ htons(key->tp_max.src) <= htons(key->tp_min.src)))
+ return -EINVAL;
+
+ return 0;
+}
+
static int fl_set_key_mpls(struct nlattr **tb,
struct flow_dissector_key_mpls *key_val,
struct flow_dissector_key_mpls *key_mask)
@@ -921,6 +1022,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->arp.tha));
}
+ if (key->basic.ip_proto == IPPROTO_TCP ||
+ key->basic.ip_proto == IPPROTO_UDP ||
+ key->basic.ip_proto == IPPROTO_SCTP) {
+ ret = fl_set_key_port_range(tb, key, mask);
+ if (ret)
+ return ret;
+ }
+
if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1038,8 +1147,9 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- FL_KEY_SET_IF_MASKED(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_PORTS, tp);
+ if (FL_KEY_IS_MASKED(mask, tp) ||
+ FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1086,6 +1196,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
fl_mask_copy(newmask, mask);
+ if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
+ (newmask->key.tp_min.src && newmask->key.tp_max.src))
+ newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
+
err = fl_init_mask_hashtable(newmask);
if (err)
goto errout_free;
@@ -1238,18 +1352,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout_idr;
- if (!tc_skip_sw(fnew->flags)) {
- if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
- err = -EEXIST;
- goto errout_mask;
- }
-
- err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
- fnew->mask->filter_ht_params);
- if (err)
- goto errout_mask;
+ if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
+ err = -EEXIST;
+ goto errout_mask;
}
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
+ if (err)
+ goto errout_mask;
+
if (!tc_skip_hw(fnew->flags)) {
err = fl_hw_replace_filter(tp, fnew, extack);
if (err)
@@ -1260,10 +1372,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
if (fold) {
- if (!tc_skip_sw(fold->flags))
- rhashtable_remove_fast(&fold->mask->ht,
- &fold->ht_node,
- fold->mask->filter_ht_params);
+ rhashtable_remove_fast(&fold->mask->ht,
+ &fold->ht_node,
+ fold->mask->filter_ht_params);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
}
@@ -1303,9 +1414,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f = arg;
- if (!tc_skip_sw(f->flags))
- rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
- f->mask->filter_ht_params);
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
__fl_delete(tp, f, extack);
*last = list_empty(&head->masks);
return 0;
@@ -1388,8 +1498,7 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain,
/* We don't care if driver (any of them) fails to handle this
* call. It serves just as a hint for it.
*/
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
}
static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
@@ -1402,8 +1511,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
}
static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
@@ -1476,6 +1584,26 @@ static int fl_dump_key_val(struct sk_buff *skb,
return 0;
}
+static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
+ &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_min.dst)) ||
+ fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
+ &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_max.dst)) ||
+ fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
+ &mask->tp_min.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_min.src)) ||
+ fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
+ &mask->tp_max.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_max.src)))
+ return -1;
+
+ return 0;
+}
+
static int fl_dump_key_mpls(struct sk_buff *skb,
struct flow_dissector_key_mpls *mpls_key,
struct flow_dissector_key_mpls *mpls_mask)
@@ -1812,6 +1940,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
sizeof(key->arp.tha))))
goto nla_put_failure;
+ if ((key->basic.ip_proto == IPPROTO_TCP ||
+ key->basic.ip_proto == IPPROTO_UDP ||
+ key->basic.ip_proto == IPPROTO_SCTP) &&
+ fl_dump_key_port_range(skb, key, mask))
+ goto nla_put_failure;
+
if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
(fl_dump_key_val(skb, &key->enc_ipv4.src,
TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 856fa79d4ffd..0e408ee9dcec 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -71,7 +71,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
tcf_block_offload_dec(block, &head->flags);
}
@@ -90,8 +90,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
cls_mall.exts = &head->exts;
cls_mall.cookie = cookie;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL,
- &cls_mall, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
if (err < 0) {
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4b28fd44576d..dcea21004604 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
}
static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
if (err < 0) {
u32_clear_hw_hnode(tp, h, NULL);
return err;
@@ -533,7 +533,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.command = TC_CLSU32_DELETE_KNODE;
cls_u32.knode.handle = n->handle;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
tcf_block_offload_dec(block, &n->flags);
}
@@ -558,11 +558,12 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.mask = 0;
#endif
cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.res = &n->res;
cls_u32.knode.exts = &n->exts;
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
if (err < 0) {
u32_remove_hw_knode(tp, n, NULL);
return err;
@@ -1206,6 +1207,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.mask = 0;
#endif
cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.res = &n->res;
cls_u32.knode.exts = &n->exts;
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ca3b0f46de53..7e4d1ccf4c87 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -335,7 +335,6 @@ out:
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
unsigned long cl;
- struct Qdisc *leaf;
const struct Qdisc_class_ops *cops = p->ops->cl_ops;
if (cops == NULL)
@@ -344,8 +343,7 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
if (cl == 0)
return NULL;
- leaf = cops->leaf(p, cl);
- return leaf;
+ return cops->leaf(p, cl);
}
/* Find queueing discipline by name */
@@ -540,7 +538,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- call_rcu_bh(&tab->rcu, stab_kfree_rcu);
+ call_rcu(&tab->rcu, stab_kfree_rcu);
}
}
EXPORT_SYMBOL(qdisc_put_stab);
@@ -810,6 +808,71 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
+int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
+ void *type_data)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ int err;
+
+ sch->flags &= ~TCQ_F_OFFLOADED;
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return 0;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (!err)
+ sch->flags |= TCQ_F_OFFLOADED;
+
+ return err;
+}
+EXPORT_SYMBOL(qdisc_offload_dump_helper);
+
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+ struct Qdisc *new, struct Qdisc *old,
+ enum tc_setup_type type, void *type_data,
+ struct netlink_ext_ack *extack)
+{
+ bool any_qdisc_is_offloaded;
+ int err;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+
+ /* Don't report error if the graft is part of destroy operation. */
+ if (!err || !new || new == &noop_qdisc)
+ return;
+
+ /* Don't report error if the parent, the old child and the new
+ * one are not offloaded.
+ */
+ any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
+
+ if (any_qdisc_is_offloaded)
+ NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+}
+EXPORT_SYMBOL(qdisc_offload_graft_helper);
+
+static void qdisc_offload_graft_root(struct net_device *dev,
+ struct Qdisc *new, struct Qdisc *old,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_root_qopt_offload graft_offload = {
+ .command = TC_ROOT_GRAFT,
+ .handle = new ? new->handle : 0,
+ .ingress = (new && new->flags & TCQ_F_INGRESS) ||
+ (old && old->flags & TCQ_F_INGRESS),
+ };
+
+ qdisc_offload_graft_helper(dev, NULL, new, old,
+ TC_SETUP_ROOT_QDISC, &graft_offload, extack);
+}
+
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 portid, u32 seq, u16 flags, int event)
{
@@ -957,7 +1020,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
{
struct Qdisc *q = old;
struct net *net = dev_net(dev);
- int err = 0;
if (parent == NULL) {
unsigned int i, num_q, ingress;
@@ -977,6 +1039,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (dev->flags & IFF_UP)
dev_deactivate(dev);
+ qdisc_offload_graft_root(dev, new, old, extack);
+
if (new && new->ops->attach)
goto skip;
@@ -1012,28 +1076,29 @@ skip:
dev_activate(dev);
} else {
const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+ unsigned long cl;
+ int err;
/* Only support running class lockless if parent is lockless */
if (new && (new->flags & TCQ_F_NOLOCK) &&
parent && !(parent->flags & TCQ_F_NOLOCK))
new->flags &= ~TCQ_F_NOLOCK;
- err = -EOPNOTSUPP;
- if (cops && cops->graft) {
- unsigned long cl = cops->find(parent, classid);
+ if (!cops || !cops->graft)
+ return -EOPNOTSUPP;
- if (cl) {
- err = cops->graft(parent, cl, new, &old,
- extack);
- } else {
- NL_SET_ERR_MSG(extack, "Specified class not found");
- err = -ENOENT;
- }
+ cl = cops->find(parent, classid);
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ return -ENOENT;
}
- if (!err)
- notify_and_destroy(net, skb, n, classid, old, new);
+
+ err = cops->graft(parent, cl, new, &old, extack);
+ if (err)
+ return err;
+ notify_and_destroy(net, skb, n, classid, old, new);
}
- return err;
+ return 0;
}
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 1538d6fa8165..1150f22983df 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -30,7 +30,7 @@ struct etf_sched_data {
int queue;
s32 delta; /* in ns */
ktime_t last; /* The txtime of the last skb sent to the netdevice. */
- struct rb_root head;
+ struct rb_root_cached head;
struct qdisc_watchdog watchdog;
ktime_t (*get_time)(void);
};
@@ -104,7 +104,7 @@ static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
struct etf_sched_data *q = qdisc_priv(sch);
struct rb_node *p;
- p = rb_first(&q->head);
+ p = rb_first_cached(&q->head);
if (!p)
return NULL;
@@ -117,8 +117,10 @@ static void reset_watchdog(struct Qdisc *sch)
struct sk_buff *skb = etf_peek_timesortedlist(sch);
ktime_t next;
- if (!skb)
+ if (!skb) {
+ qdisc_watchdog_cancel(&q->watchdog);
return;
+ }
next = ktime_sub_ns(skb->tstamp, q->delta);
qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
@@ -154,8 +156,9 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct etf_sched_data *q = qdisc_priv(sch);
- struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL;
ktime_t txtime = nskb->tstamp;
+ bool leftmost = true;
if (!is_packet_valid(sch, nskb)) {
report_sock_error(nskb, EINVAL,
@@ -168,13 +171,15 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
parent = *p;
skb = rb_to_skb(parent);
- if (ktime_after(txtime, skb->tstamp))
+ if (ktime_after(txtime, skb->tstamp)) {
p = &parent->rb_right;
- else
+ leftmost = false;
+ } else {
p = &parent->rb_left;
+ }
}
rb_link_node(&nskb->rbnode, parent, p);
- rb_insert_color(&nskb->rbnode, &q->head);
+ rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost);
qdisc_qstats_backlog_inc(sch, nskb);
sch->q.qlen++;
@@ -185,12 +190,42 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
return NET_XMIT_SUCCESS;
}
-static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
- bool drop)
+static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb,
+ ktime_t now)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *to_free = NULL;
+ struct sk_buff *tmp = NULL;
+
+ skb_rbtree_walk_from_safe(skb, tmp) {
+ if (ktime_after(skb->tstamp, now))
+ break;
+
+ rb_erase_cached(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch, &to_free);
+ qdisc_qstats_overlimit(sch);
+ sch->q.qlen--;
+ }
+
+ kfree_skb_list(to_free);
+}
+
+static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb)
{
struct etf_sched_data *q = qdisc_priv(sch);
- rb_erase(&skb->rbnode, &q->head);
+ rb_erase_cached(&skb->rbnode, &q->head);
/* The rbnode field in the skb re-uses these fields, now that
* we are done with the rbnode, reset them.
@@ -201,19 +236,9 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
qdisc_qstats_backlog_dec(sch, skb);
- if (drop) {
- struct sk_buff *to_free = NULL;
+ qdisc_bstats_update(sch, skb);
- report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
-
- qdisc_drop(skb, sch, &to_free);
- kfree_skb_list(to_free);
- qdisc_qstats_overlimit(sch);
- } else {
- qdisc_bstats_update(sch, skb);
-
- q->last = skb->tstamp;
- }
+ q->last = skb->tstamp;
sch->q.qlen--;
}
@@ -232,7 +257,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
/* Drop if packet has expired while in queue. */
if (ktime_before(skb->tstamp, now)) {
- timesortedlist_erase(sch, skb, true);
+ timesortedlist_drop(sch, skb, now);
skb = NULL;
goto out;
}
@@ -241,7 +266,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
* txtime from deadline to (now + delta).
*/
if (q->deadline_mode) {
- timesortedlist_erase(sch, skb, false);
+ timesortedlist_remove(sch, skb);
skb->tstamp = now;
goto out;
}
@@ -250,7 +275,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
/* Dequeue only if now is within the [txtime - delta, txtime] range. */
if (ktime_after(now, next))
- timesortedlist_erase(sch, skb, false);
+ timesortedlist_remove(sch, skb);
else
skb = NULL;
@@ -386,14 +411,14 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
static void timesortedlist_clear(struct Qdisc *sch)
{
struct etf_sched_data *q = qdisc_priv(sch);
- struct rb_node *p = rb_first(&q->head);
+ struct rb_node *p = rb_first_cached(&q->head);
while (p) {
struct sk_buff *skb = rb_to_skb(p);
p = rb_next(p);
- rb_erase(&skb->rbnode, &q->head);
+ rb_erase_cached(&skb->rbnode, &q->head);
rtnl_kfree_skbs(skb, skb);
sch->q.qlen--;
}
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 25a7cf6d380f..1a662f2bb7bb 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
u32 flow_refill_delay;
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
+ u64 ce_threshold;
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
@@ -107,6 +108,7 @@ struct fq_sched_data {
u64 stat_gc_flows;
u64 stat_internal_packets;
u64 stat_throttled;
+ u64 stat_ce_mark;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
@@ -412,16 +414,21 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 now = ktime_get_ns();
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
unsigned long rate;
u32 plen;
+ u64 now;
+
+ if (!sch->q.qlen)
+ return NULL;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
goto out;
+
+ now = ktime_get_ns();
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
@@ -454,6 +461,11 @@ begin:
fq_flow_set_throttled(q, f);
goto begin;
}
+ if (time_next_packet &&
+ (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ INET_ECN_set_ce(skb);
+ q->stat_ce_mark++;
+ }
}
skb = fq_dequeue_head(sch, f);
@@ -657,6 +669,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -736,6 +749,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+ if (tb[TCA_FQ_CE_THRESHOLD])
+ q->ce_threshold = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
@@ -786,6 +803,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+
+ /* Default ce_threshold of 4294 seconds */
+ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+
qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
@@ -799,6 +820,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ u64 ce_threshold = q->ce_threshold;
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
@@ -807,6 +829,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+ do_div(ce_threshold, NSEC_PER_USEC);
+
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
@@ -819,6 +843,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
+ nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -848,6 +873,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.throttled_flows = q->throttled_flows;
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
+ st.ce_mark = q->stat_ce_mark;
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index de1663f7d3ad..66ba2ce2320f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1372,7 +1372,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
if (!tp_head) {
RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
/* Wait for flying RCU callback before it is freed. */
- rcu_barrier_bh();
+ rcu_barrier();
return;
}
@@ -1380,10 +1380,10 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
&miniqp->miniq1 : &miniqp->miniq2;
/* We need to make sure that readers won't see the miniq
- * we are about to modify. So wait until previous call_rcu_bh callback
+ * we are about to modify. So wait until previous call_rcu callback
* is done.
*/
- rcu_barrier_bh();
+ rcu_barrier();
miniq->filter_list = tp_head;
rcu_assign_pointer(*miniqp->p_miniq, miniq);
@@ -1392,7 +1392,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
* block potential new user of miniq_old until all readers
* are not seeing it.
*/
- call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
+ call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 4a042abf844c..234afbf9115b 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -23,19 +23,23 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
#include <net/red.h>
#define GRED_DEF_PRIO (MAX_DPs / 2)
#define GRED_VQ_MASK (MAX_DPs - 1)
+#define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP)
+
struct gred_sched_data;
struct gred_sched;
struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
u32 DP; /* the drop parameters */
- u32 bytesin; /* bytes seen on virtualQ so far*/
+ u32 red_flags; /* virtualQ version of red_flags */
+ u64 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u8 prio; /* the prio of this vq */
@@ -139,14 +143,27 @@ static inline void gred_store_wred_set(struct gred_sched *table,
table->wred_set.qidlestart = q->vars.qidlestart;
}
-static inline int gred_use_ecn(struct gred_sched *t)
+static int gred_use_ecn(struct gred_sched_data *q)
+{
+ return q->red_flags & TC_RED_ECN;
+}
+
+static int gred_use_harddrop(struct gred_sched_data *q)
{
- return t->red_flags & TC_RED_ECN;
+ return q->red_flags & TC_RED_HARDDROP;
}
-static inline int gred_use_harddrop(struct gred_sched *t)
+static bool gred_per_vq_red_flags_used(struct gred_sched *table)
{
- return t->red_flags & TC_RED_HARDDROP;
+ unsigned int i;
+
+ /* Local per-vq flags couldn't have been set unless global are 0 */
+ if (table->red_flags)
+ return false;
+ for (i = 0; i < MAX_DPs; i++)
+ if (table->tab[i] && table->tab[i]->red_flags)
+ return true;
+ return false;
}
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -212,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
- if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
@@ -222,7 +239,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
- if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ if (gred_use_harddrop(q) || !gred_use_ecn(q) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
@@ -295,15 +312,103 @@ static void gred_reset(struct Qdisc *sch)
}
}
+static void gred_offload(struct Qdisc *sch, enum tc_gred_command command)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_gred_qopt_offload opt = {
+ .command = command,
+ .handle = sch->handle,
+ .parent = sch->parent,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ if (command == TC_GRED_REPLACE) {
+ unsigned int i;
+
+ opt.set.grio_on = gred_rio_mode(table);
+ opt.set.wred_on = gred_wred_mode(table);
+ opt.set.dp_cnt = table->DPs;
+ opt.set.dp_def = table->def;
+
+ for (i = 0; i < table->DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ if (!q)
+ continue;
+ opt.set.tab[i].present = true;
+ opt.set.tab[i].limit = q->limit;
+ opt.set.tab[i].prio = q->prio;
+ opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog;
+ opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog;
+ opt.set.tab[i].is_ecn = gred_use_ecn(q);
+ opt.set.tab[i].is_harddrop = gred_use_harddrop(q);
+ opt.set.tab[i].probability = q->parms.max_P;
+ opt.set.tab[i].backlog = &q->backlog;
+ }
+ opt.set.qstats = &sch->qstats;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt);
+}
+
+static int gred_offload_dump_stats(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_qopt_offload *hw_stats;
+ unsigned int i;
+ int ret;
+
+ hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL);
+ if (!hw_stats)
+ return -ENOMEM;
+
+ hw_stats->command = TC_GRED_STATS;
+ hw_stats->handle = sch->handle;
+ hw_stats->parent = sch->parent;
+
+ for (i = 0; i < MAX_DPs; i++)
+ if (table->tab[i])
+ hw_stats->stats.xstats[i] = &table->tab[i]->stats;
+
+ ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats);
+ /* Even if driver returns failure adjust the stats - in case offload
+ * ended but driver still wants to adjust the values.
+ */
+ for (i = 0; i < MAX_DPs; i++) {
+ if (!table->tab[i])
+ continue;
+ table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets;
+ table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes;
+ table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog;
+
+ _bstats_update(&sch->bstats,
+ hw_stats->stats.bstats[i].bytes,
+ hw_stats->stats.bstats[i].packets);
+ sch->qstats.qlen += hw_stats->stats.qstats[i].qlen;
+ sch->qstats.backlog += hw_stats->stats.qstats[i].backlog;
+ sch->qstats.drops += hw_stats->stats.qstats[i].drops;
+ sch->qstats.requeues += hw_stats->stats.qstats[i].requeues;
+ sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits;
+ }
+
+ kfree(hw_stats);
+ return ret;
+}
+
static inline void gred_destroy_vq(struct gred_sched_data *q)
{
kfree(q);
}
-static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
+ struct netlink_ext_ack *extack)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_sopt *sopt;
+ bool red_flags_changed;
int i;
if (!dps)
@@ -311,13 +416,28 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
sopt = nla_data(dps);
- if (sopt->DPs > MAX_DPs || sopt->DPs == 0 ||
- sopt->def_DP >= sopt->DPs)
+ if (sopt->DPs > MAX_DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high");
+ return -EINVAL;
+ }
+ if (sopt->DPs == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "number of virtual queues can't be 0");
+ return -EINVAL;
+ }
+ if (sopt->def_DP >= sopt->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count");
return -EINVAL;
+ }
+ if (sopt->flags && gred_per_vq_red_flags_used(table)) {
+ NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used");
+ return -EINVAL;
+ }
sch_tree_lock(sch);
table->DPs = sopt->DPs;
table->def = sopt->def_DP;
+ red_flags_changed = table->red_flags != sopt->flags;
table->red_flags = sopt->flags;
/*
@@ -337,6 +457,12 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
gred_disable_wred_mode(table);
}
+ if (red_flags_changed)
+ for (i = 0; i < table->DPs; i++)
+ if (table->tab[i])
+ table->tab[i]->red_flags =
+ table->red_flags & GRED_VQ_RED_FLAGS;
+
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
@@ -346,25 +472,30 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
}
}
+ gred_offload(sch, TC_GRED_REPLACE);
return 0;
}
static inline int gred_change_vq(struct Qdisc *sch, int dp,
struct tc_gred_qopt *ctl, int prio,
u8 *stab, u32 max_P,
- struct gred_sched_data **prealloc)
+ struct gred_sched_data **prealloc,
+ struct netlink_ext_ack *extack)
{
struct gred_sched *table = qdisc_priv(sch);
struct gred_sched_data *q = table->tab[dp];
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
return -EINVAL;
+ }
if (!q) {
table->tab[dp] = q = *prealloc;
*prealloc = NULL;
if (!q)
return -ENOMEM;
+ q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS;
}
q->DP = dp;
@@ -384,14 +515,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
return 0;
}
+static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = {
+ [TCA_GRED_VQ_DP] = { .type = NLA_U32 },
+ [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = {
+ [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED },
+};
+
static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
[TCA_GRED_MAX_P] = { .type = NLA_U32 },
[TCA_GRED_LIMIT] = { .type = NLA_U32 },
+ [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED },
};
+static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry)
+{
+ struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+ u32 dp;
+
+ nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL);
+
+ dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+
+ if (tb[TCA_GRED_VQ_FLAGS])
+ table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+}
+
+static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs)
+{
+ const struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, vqs, rem) {
+ switch (nla_type(attr)) {
+ case TCA_GRED_VQ_ENTRY:
+ gred_vq_apply(table, attr);
+ break;
+ }
+ }
+}
+
+static int gred_vq_validate(struct gred_sched *table, u32 cdp,
+ const struct nlattr *entry,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+ int err;
+ u32 dp;
+
+ err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_GRED_VQ_DP]) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified");
+ return -EINVAL;
+ }
+ dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+ if (dp >= table->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds");
+ return -EINVAL;
+ }
+ if (dp != cdp && !table->tab[dp]) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_GRED_VQ_FLAGS]) {
+ u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+
+ if (table->red_flags && table->red_flags != red_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used");
+ return -EINVAL;
+ }
+ if (red_flags & ~GRED_VQ_RED_FLAGS) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "invalid RED flags specified");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int gred_vqs_validate(struct gred_sched *table, u32 cdp,
+ struct nlattr *vqs, struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int rem, err;
+
+ err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX,
+ gred_vqe_policy, extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, vqs, rem) {
+ switch (nla_type(attr)) {
+ case TCA_GRED_VQ_ENTRY:
+ err = gred_vq_validate(table, cdp, attr, extack);
+ if (err)
+ return err;
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes");
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int gred_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -406,29 +650,39 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
if (err < 0)
return err;
if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) {
if (tb[TCA_GRED_LIMIT] != NULL)
sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
- return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
}
if (tb[TCA_GRED_PARMS] == NULL ||
tb[TCA_GRED_STAB] == NULL ||
- tb[TCA_GRED_LIMIT] != NULL)
+ tb[TCA_GRED_LIMIT] != NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time");
return -EINVAL;
+ }
max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
- err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
- if (ctl->DP >= table->DPs)
- goto errout;
+ if (ctl->DP >= table->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_GRED_VQ_LIST]) {
+ err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST],
+ extack);
+ if (err)
+ return err;
+ }
if (gred_rio_mode(table)) {
if (ctl->prio == 0) {
@@ -448,9 +702,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
sch_tree_lock(sch);
- err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc,
+ extack);
if (err < 0)
- goto errout_locked;
+ goto err_unlock_free;
+
+ if (tb[TCA_GRED_VQ_LIST])
+ gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]);
if (gred_rio_mode(table)) {
gred_disable_wred_mode(table);
@@ -458,12 +716,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
gred_enable_wred_mode(table);
}
- err = 0;
+ sch_tree_unlock(sch);
+ kfree(prealloc);
+
+ gred_offload(sch, TC_GRED_REPLACE);
+ return 0;
-errout_locked:
+err_unlock_free:
sch_tree_unlock(sch);
kfree(prealloc);
-errout:
return err;
}
@@ -476,12 +737,15 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
if (err < 0)
return err;
- if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
+ if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "virtual queue configuration can't be specified at initialization time");
return -EINVAL;
+ }
if (tb[TCA_GRED_LIMIT])
sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
@@ -489,13 +753,13 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
sch->limit = qdisc_dev(sch)->tx_queue_len
* psched_mtu(qdisc_dev(sch));
- return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
}
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct gred_sched *table = qdisc_priv(sch);
- struct nlattr *parms, *opts = NULL;
+ struct nlattr *parms, *vqs, *opts = NULL;
int i;
u32 max_p[MAX_DPs];
struct tc_gred_sopt sopt = {
@@ -505,6 +769,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
.flags = table->red_flags,
};
+ if (gred_offload_dump_stats(sch))
+ goto nla_put_failure;
+
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
@@ -522,6 +789,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
goto nla_put_failure;
+ /* Old style all-in-one dump of VQs */
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
@@ -572,6 +840,58 @@ append_opt:
nla_nest_end(skb, parms);
+ /* Dump the VQs again, in more structured way */
+ vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST);
+ if (!vqs)
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ struct nlattr *vq;
+
+ if (!q)
+ continue;
+
+ vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY);
+ if (!vq)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags))
+ goto nla_put_failure;
+
+ /* Stats */
+ if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
+ TCA_GRED_VQ_PAD))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG,
+ gred_backlog(table, q, sch)))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP,
+ q->stats.prob_drop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK,
+ q->stats.prob_mark))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP,
+ q->stats.forced_drop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK,
+ q->stats.forced_mark))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, vq);
+ }
+ nla_nest_end(skb, vqs);
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -588,6 +908,7 @@ static void gred_destroy(struct Qdisc *sch)
if (table->tab[i])
gred_destroy_vq(table->tab[i]);
}
+ gred_offload(sch, TC_GRED_DESTROY);
}
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f20f3a0f8424..203659bc3906 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
}
-static void mq_offload_stats(struct Qdisc *sch)
+static int mq_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_mq_qopt_offload opt = {
.command = TC_MQ_STATS,
.handle = sch->handle,
@@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch)
},
};
- if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
}
static void mq_destroy(struct Qdisc *sch)
@@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
- mq_offload_stats(sch);
- return 0;
+ return mq_offload_stats(sch);
}
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
@@ -196,6 +193,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+ struct tc_mq_qopt_offload graft_offload;
struct net_device *dev = qdisc_dev(sch);
if (dev->flags & IFF_UP)
@@ -206,6 +204,14 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
+
+ graft_offload.handle = sch->handle;
+ graft_offload.graft_params.queue = cl - 1;
+ graft_offload.graft_params.child_handle = new ? new->handle : 0;
+ graft_offload.command = TC_MQ_GRAFT;
+
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+ TC_SETUP_QDISC_MQ, &graft_offload, extack);
return 0;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2c38e3d07924..75046ec72144 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,6 +77,10 @@ struct netem_sched_data {
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
+ /* a linear queue; reduces rbtree rebalancing when jitter is low */
+ struct sk_buff *t_head;
+ struct sk_buff *t_tail;
+
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
@@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch)
rb_erase(&skb->rbnode, &q->t_root);
rtnl_kfree_skbs(skb, skb);
}
+
+ rtnl_kfree_skbs(q->t_head, q->t_tail);
+ q->t_head = NULL;
+ q->t_tail = NULL;
}
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
u64 tnext = netem_skb_cb(nskb)->time_to_send;
- struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
- while (*p) {
- struct sk_buff *skb;
-
- parent = *p;
- skb = rb_to_skb(parent);
- if (tnext >= netem_skb_cb(skb)->time_to_send)
- p = &parent->rb_right;
+ if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+ if (q->t_tail)
+ q->t_tail->next = nskb;
else
- p = &parent->rb_left;
+ q->t_head = nskb;
+ q->t_tail = nskb;
+ } else {
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->t_root);
}
- rb_link_node(&nskb->rbnode, parent, p);
- rb_insert_color(&nskb->rbnode, &q->t_root);
sch->q.qlen++;
}
@@ -431,6 +448,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
int count = 1;
int rc = NET_XMIT_SUCCESS;
+ /* Do not fool qdisc_drop_all() */
+ skb->prev = NULL;
+
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
@@ -530,9 +550,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
t_skb = skb_rb_last(&q->t_root);
t_last = netem_skb_cb(t_skb);
if (!last ||
- t_last->time_to_send > last->time_to_send) {
+ t_last->time_to_send > last->time_to_send)
+ last = t_last;
+ }
+ if (q->t_tail) {
+ struct netem_skb_cb *t_last =
+ netem_skb_cb(q->t_tail);
+
+ if (!last ||
+ t_last->time_to_send > last->time_to_send)
last = t_last;
- }
}
if (last) {
@@ -611,11 +638,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
q->slot.bytes_left = q->slot_config.max_bytes;
}
+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+ struct sk_buff *skb = skb_rb_first(&q->t_root);
+ u64 t1, t2;
+
+ if (!skb)
+ return q->t_head;
+ if (!q->t_head)
+ return skb;
+
+ t1 = netem_skb_cb(skb)->time_to_send;
+ t2 = netem_skb_cb(q->t_head)->time_to_send;
+ if (t1 < t2)
+ return skb;
+ return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+ if (skb == q->t_head) {
+ q->t_head = skb->next;
+ if (!q->t_head)
+ q->t_tail = NULL;
+ } else {
+ rb_erase(&skb->rbnode, &q->t_root);
+ }
+}
+
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- struct rb_node *p;
tfifo_dequeue:
skb = __qdisc_dequeue_head(&sch->q);
@@ -625,20 +679,18 @@ deliver:
qdisc_bstats_update(sch, skb);
return skb;
}
- p = rb_first(&q->t_root);
- if (p) {
+ skb = netem_peek(q);
+ if (skb) {
u64 time_to_send;
u64 now = ktime_get_ns();
- skb = rb_to_skb(p);
-
/* if more time remaining? */
time_to_send = netem_skb_cb(skb)->time_to_send;
if (q->slot.slot_next && q->slot.slot_next < time_to_send)
get_slot_next(q, now);
- if (time_to_send <= now && q->slot.slot_next <= now) {
- rb_erase(p, &q->t_root);
+ if (time_to_send <= now && q->slot.slot_next <= now) {
+ netem_erase_head(q, skb);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
skb->next = NULL;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index f8af98621179..cdf68706e40f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_put(child);
}
for (i = oldbands; i < q->bands; i++) {
@@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
}
sch_tree_unlock(sch);
+
+ for (i = q->bands; i < oldbands; i++)
+ qdisc_put(q->queues[i]);
return 0;
}
@@ -251,7 +253,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
static int prio_dump_offload(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_prio_qopt_offload hw_stats = {
.command = TC_PRIO_STATS,
.handle = sch->handle,
@@ -263,21 +264,8 @@ static int prio_dump_offload(struct Qdisc *sch)
},
},
};
- int err;
-
- sch->flags &= ~TCQ_F_OFFLOADED;
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
-
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -309,43 +297,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
{
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt_offload graft_offload;
- struct net_device *dev = qdisc_dev(sch);
unsigned long band = arg - 1;
- bool any_qdisc_is_offloaded;
- int err;
if (new == NULL)
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->queues[band]);
- if (!tc_can_offload(dev))
- return 0;
-
graft_offload.handle = sch->handle;
graft_offload.parent = sch->parent;
graft_offload.graft_params.band = band;
graft_offload.graft_params.child_handle = new->handle;
graft_offload.command = TC_PRIO_GRAFT;
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &graft_offload);
-
- /* Don't report error if the graft is part of destroy operation. */
- if (err && new != &noop_qdisc) {
- /* Don't report error if the parent, the old child and the new
- * one are not offloaded.
- */
- any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
- any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
- if (*old)
- any_qdisc_is_offloaded |= (*old)->flags &
- TCQ_F_OFFLOADED;
-
- if (any_qdisc_is_offloaded)
- NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
- }
-
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+ TC_SETUP_QDISC_PRIO, &graft_offload,
+ extack);
return 0;
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3ce6c0a2c493..9df9942340ea 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -166,7 +166,9 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.set.min = q->parms.qth_min >> q->parms.Wlog;
opt.set.max = q->parms.qth_max >> q->parms.Wlog;
opt.set.probability = q->parms.max_P;
+ opt.set.limit = q->limit;
opt.set.is_ecn = red_use_ecn(q);
+ opt.set.is_harddrop = red_use_harddrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -193,10 +195,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
static int red_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
struct tc_red_qopt *ctl;
- struct Qdisc *child = NULL;
int err;
u32 max_P;
@@ -233,7 +235,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_put(q->qdisc);
+ old_child = q->qdisc;
q->qdisc = child;
}
@@ -252,7 +254,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
+
red_offload(sch, true);
+
+ if (old_child)
+ qdisc_put(old_child);
return 0;
}
@@ -279,9 +285,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
return red_change(sch, opt, extack);
}
-static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_red_qopt_offload hw_stats = {
.command = TC_RED_STATS,
.handle = sch->handle,
@@ -291,22 +296,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
.stats.qstats = &sch->qstats,
},
};
- int err;
-
- sch->flags &= ~TCQ_F_OFFLOADED;
-
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
-
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats);
}
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -324,7 +315,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
};
int err;
- err = red_dump_offload_stats(sch, &opt);
+ err = red_dump_offload_stats(sch);
if (err)
goto nla_put_failure;
@@ -377,6 +368,21 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl,
return 0;
}
+static void red_graft_offload(struct Qdisc *sch,
+ struct Qdisc *new, struct Qdisc *old,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_red_qopt_offload graft_offload = {
+ .handle = sch->handle,
+ .parent = sch->parent,
+ .child_handle = new->handle,
+ .command = TC_RED_GRAFT,
+ };
+
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
+ TC_SETUP_QDISC_RED, &graft_offload, extack);
+}
+
static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
{
@@ -386,6 +392,8 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->qdisc);
+
+ red_graft_offload(sch, new, *old, extack);
return 0;
}
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96e779e..201c888604e4 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init(
asoc->flowlabel = sp->flowlabel;
asoc->dscp = sp->dscp;
- /* Initialize default path MTU. */
- asoc->pathmtu = sp->pathmtu;
-
/* Set association default SACK delay */
asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
asoc->sackfreq = sp->sackfreq;
@@ -135,6 +132,8 @@ static struct sctp_association *sctp_association_init(
*/
asoc->max_burst = sp->max_burst;
+ asoc->subscribe = sp->subscribe;
+
/* initialize association timers */
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
@@ -252,6 +251,10 @@ static struct sctp_association *sctp_association_init(
0, gfp))
goto fail_init;
+ /* Initialize default path MTU. */
+ asoc->pathmtu = sp->pathmtu;
+ sctp_assoc_update_frag_point(asoc);
+
/* Assume that peer would support both address types unless we are
* told otherwise.
*/
@@ -434,7 +437,7 @@ static void sctp_association_destroy(struct sctp_association *asoc)
WARN_ON(atomic_read(&asoc->rmem_alloc));
- kfree(asoc);
+ kfree_rcu(asoc, rcu);
SCTP_DBG_OBJCNT_DEC(assoc);
}
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 7df3704982f5..ebf28adba789 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
return match;
}
+int sctp_bind_addrs_check(struct sctp_sock *sp,
+ struct sctp_sock *sp2, int cnt2)
+{
+ struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr;
+ struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
+ struct sctp_sockaddr_entry *laddr, *laddr2;
+ bool exist = false;
+ int cnt = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ list_for_each_entry_rcu(laddr2, &bp2->address_list, list) {
+ if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
+ laddr->valid && laddr2->valid) {
+ exist = true;
+ goto next;
+ }
+ }
+ cnt = 0;
+ break;
+next:
+ cnt++;
+ }
+ rcu_read_unlock();
+
+ return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1);
+}
+
/* Does the address 'addr' conflict with any addresses in
* the bp.
*/
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce8087846f05..64bef313d436 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg)
/* Final destructruction of datamsg memory. */
static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
{
+ struct sctp_association *asoc = NULL;
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
- struct sctp_sock *sp;
struct sctp_ulpevent *ev;
- struct sctp_association *asoc = NULL;
int error = 0, notify;
/* If we failed, we may need to notify. */
@@ -108,9 +107,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
else
error = asoc->outqueue.error;
- sp = sctp_sk(asoc->base.sk);
- notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
- &sp->subscribe);
+ notify = sctp_ulpevent_type_enabled(asoc->subscribe,
+ SCTP_SEND_FAILED);
}
/* Generate a SEND FAILED event only if enabled. */
@@ -191,6 +189,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
* the packet
*/
max_data = asoc->frag_point;
+ if (unlikely(!max_data)) {
+ max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk),
+ sctp_datachk_len(&asoc->stream));
+ pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)",
+ __func__, asoc, max_data);
+ }
/* If the the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99882ed..d7a649d240e5 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -57,6 +57,7 @@
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
#include <linux/rhashtable.h>
+#include <net/sock_reuseport.h>
/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
@@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
const union sctp_addr *paddr,
const union sctp_addr *laddr,
struct sctp_transport **transportp);
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
- const union sctp_addr *laddr);
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+ struct net *net, struct sk_buff *skb,
+ const union sctp_addr *laddr,
+ const union sctp_addr *daddr);
static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
@@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
if (!asoc)
- ep = __sctp_rcv_lookup_endpoint(net, &dest);
+ ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
/* Retrieve the common input handling substructure. */
rcvr = asoc ? &asoc->base : &ep->base;
@@ -574,7 +577,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
* is probably better.
*
*/
-void sctp_v4_err(struct sk_buff *skb, __u32 info)
+int sctp_v4_err(struct sk_buff *skb, __u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
const int ihlen = iph->ihl * 4;
@@ -599,7 +602,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
skb->transport_header = savesctp;
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
/* Warning: The sock lock is held. Remember to call
* sctp_err_finish!
@@ -653,6 +656,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
out_unlock:
sctp_err_finish(sk, transport);
+ return 0;
}
/*
@@ -720,43 +724,87 @@ discard:
}
/* Insert endpoint into the hash table. */
-static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
+static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
{
- struct net *net = sock_net(ep->base.sk);
- struct sctp_ep_common *epb;
+ struct sock *sk = ep->base.sk;
+ struct net *net = sock_net(sk);
struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
epb = &ep->base;
-
epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
head = &sctp_ep_hashtable[epb->hashent];
+ if (sk->sk_reuseport) {
+ bool any = sctp_is_ep_boundall(sk);
+ struct sctp_ep_common *epb2;
+ struct list_head *list;
+ int cnt = 0, err = 1;
+
+ list_for_each(list, &ep->base.bind_addr.address_list)
+ cnt++;
+
+ sctp_for_each_hentry(epb2, &head->chain) {
+ struct sock *sk2 = epb2->sk;
+
+ if (!net_eq(sock_net(sk2), net) || sk2 == sk ||
+ !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) ||
+ !sk2->sk_reuseport)
+ continue;
+
+ err = sctp_bind_addrs_check(sctp_sk(sk2),
+ sctp_sk(sk), cnt);
+ if (!err) {
+ err = reuseport_add_sock(sk, sk2, any);
+ if (err)
+ return err;
+ break;
+ } else if (err < 0) {
+ return err;
+ }
+ }
+
+ if (err) {
+ err = reuseport_alloc(sk, any);
+ if (err)
+ return err;
+ }
+ }
+
write_lock(&head->lock);
hlist_add_head(&epb->node, &head->chain);
write_unlock(&head->lock);
+ return 0;
}
/* Add an endpoint to the hash. Local BH-safe. */
-void sctp_hash_endpoint(struct sctp_endpoint *ep)
+int sctp_hash_endpoint(struct sctp_endpoint *ep)
{
+ int err;
+
local_bh_disable();
- __sctp_hash_endpoint(ep);
+ err = __sctp_hash_endpoint(ep);
local_bh_enable();
+
+ return err;
}
/* Remove endpoint from the hash table. */
static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
{
- struct net *net = sock_net(ep->base.sk);
+ struct sock *sk = ep->base.sk;
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
epb = &ep->base;
- epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
+ epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port);
head = &sctp_ep_hashtable[epb->hashent];
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
write_lock(&head->lock);
hlist_del_init(&epb->node);
write_unlock(&head->lock);
@@ -770,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
local_bh_enable();
}
+static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
+ const union sctp_addr *paddr, __u32 seed)
+{
+ __u32 addr;
+
+ if (paddr->sa.sa_family == AF_INET6)
+ addr = jhash(&paddr->v6.sin6_addr, 16, seed);
+ else
+ addr = (__force __u32)paddr->v4.sin_addr.s_addr;
+
+ return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
+ (__force __u32)lport, net_hash_mix(net), seed);
+}
+
/* Look up an endpoint. */
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
- const union sctp_addr *laddr)
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+ struct net *net, struct sk_buff *skb,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr)
{
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
struct sctp_endpoint *ep;
+ struct sock *sk;
+ __be16 lport;
int hash;
- hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
+ lport = laddr->v4.sin_port;
+ hash = sctp_ep_hashfn(net, ntohs(lport));
head = &sctp_ep_hashtable[hash];
read_lock(&head->lock);
sctp_for_each_hentry(epb, &head->chain) {
@@ -791,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
ep = sctp_sk(net->sctp.ctl_sock)->ep;
hit:
+ sk = ep->base.sk;
+ if (sk->sk_reuseport) {
+ __u32 phash = sctp_hashfn(net, lport, paddr, 0);
+
+ sk = reuseport_select_sock(sk, phash, skb,
+ sizeof(struct sctphdr));
+ if (sk)
+ ep = sctp_sk(sk)->ep;
+ }
sctp_endpoint_hold(ep);
read_unlock(&head->lock);
return ep;
@@ -829,35 +905,17 @@ out:
static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
{
const struct sctp_transport *t = data;
- const union sctp_addr *paddr = &t->ipaddr;
- const struct net *net = sock_net(t->asoc->base.sk);
- __be16 lport = htons(t->asoc->base.bind_addr.port);
- __u32 addr;
-
- if (paddr->sa.sa_family == AF_INET6)
- addr = jhash(&paddr->v6.sin6_addr, 16, seed);
- else
- addr = (__force __u32)paddr->v4.sin_addr.s_addr;
- return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
- (__force __u32)lport, net_hash_mix(net), seed);
+ return sctp_hashfn(sock_net(t->asoc->base.sk),
+ htons(t->asoc->base.bind_addr.port),
+ &t->ipaddr, seed);
}
static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
{
const struct sctp_hash_cmp_arg *x = data;
- const union sctp_addr *paddr = x->paddr;
- const struct net *net = x->net;
- __be16 lport = x->lport;
- __u32 addr;
- if (paddr->sa.sa_family == AF_INET6)
- addr = jhash(&paddr->v6.sin6_addr, 16, seed);
- else
- addr = (__force __u32)paddr->v4.sin_addr.s_addr;
-
- return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
- (__force __u32)lport, net_hash_mix(net), seed);
+ return sctp_hashfn(x->net, x->lport, x->paddr, seed);
}
static const struct rhashtable_params sctp_hash_params = {
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fc6c5e4bffa5..b9ed271b7ef7 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -101,6 +101,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
if (addr) {
addr->a.v6.sin6_family = AF_INET6;
addr->a.v6.sin6_port = 0;
+ addr->a.v6.sin6_flowinfo = 0;
addr->a.v6.sin6_addr = ifa->addr;
addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex;
addr->valid = 1;
@@ -138,7 +139,7 @@ static struct notifier_block sctp_inet6addr_notifier = {
};
/* ICMP error handler. */
-static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct inet6_dev *idev;
@@ -147,7 +148,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct sctp_transport *transport;
struct ipv6_pinfo *np;
__u16 saveip, savesctp;
- int err;
+ int err, ret = 0;
struct net *net = dev_net(skb->dev);
idev = in6_dev_get(skb->dev);
@@ -163,6 +164,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
skb->transport_header = savesctp;
if (!sk) {
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS);
+ ret = -ENOENT;
goto out;
}
@@ -202,6 +204,8 @@ out_unlock:
out:
if (likely(idev != NULL))
in6_dev_put(idev);
+
+ return ret;
}
static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
diff --git a/net/sctp/output.c b/net/sctp/output.c
index b0e74a3e77ec..025f48e14a91 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
head->truesize += skb->truesize;
head->data_len += skb->len;
head->len += skb->len;
+ refcount_add(skb->truesize, &head->sk->sk_wmem_alloc);
__skb_header_release(skb);
}
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index c0817f7a8964..a8c4c33377bc 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -53,7 +53,7 @@
int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
void *arg) { \
int error = 0; \
- enum sctp_event event_type; union sctp_subtype subtype; \
+ enum sctp_event_type event_type; union sctp_subtype subtype; \
enum sctp_state state; \
struct sctp_endpoint *ep; \
\
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4a4fd1971255..f4ac6c592e13 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
asoc->c.sinit_max_instreams, gfp))
goto clean_up;
+ /* Update frag_point when stream_interleave may get changed. */
+ sctp_assoc_update_frag_point(asoc);
+
if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
goto clean_up;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 85d393090238..1d143bc3f73d 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -52,7 +52,7 @@
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
enum sctp_disposition status,
struct sctp_cmd_seq *commands,
gfp_t gfp);
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands,
/* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */
static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
struct sctp_association *asoc,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
union sctp_subtype subtype,
struct sctp_chunk *chunk,
unsigned int error)
@@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc)
* If you want to understand all of lksctp, this is a
* good place to start.
*/
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
union sctp_subtype subtype, enum sctp_state state,
struct sctp_endpoint *ep, struct sctp_association *asoc,
void *event_arg, gfp_t gfp)
@@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type,
/*****************************************************************
* This the master state function side effect processing function.
*****************************************************************/
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -1285,7 +1285,7 @@ bail:
********************************************************************/
/* This is the side-effect interpreter. */
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 691d9dc620e3..d239b94aa48c 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -79,7 +79,7 @@ static const struct sctp_sm_table_entry bug = {
const struct sctp_sm_table_entry *sctp_sm_lookup_event(
struct net *net,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype)
{
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf618d1b41fd..f93c3cf9e567 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2230,7 +2230,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (sp->recvrcvinfo)
sctp_ulpevent_read_rcvinfo(event, msg);
/* Check if we allow SCTP_SNDRCVINFO. */
- if (sp->subscribe.sctp_data_io_event)
+ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT))
sctp_ulpevent_read_sndrcvinfo(event, msg);
err = copied;
@@ -2304,22 +2304,33 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk,
static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
unsigned int optlen)
{
+ struct sctp_event_subscribe subscribe;
+ __u8 *sn_type = (__u8 *)&subscribe;
+ struct sctp_sock *sp = sctp_sk(sk);
struct sctp_association *asoc;
- struct sctp_ulpevent *event;
+ int i;
if (optlen > sizeof(struct sctp_event_subscribe))
return -EINVAL;
- if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
+
+ if (copy_from_user(&subscribe, optval, optlen))
return -EFAULT;
+ for (i = 0; i < optlen; i++)
+ sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i,
+ sn_type[i]);
+
+ list_for_each_entry(asoc, &sp->ep->asocs, asocs)
+ asoc->subscribe = sctp_sk(sk)->subscribe;
+
/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
* if there is no data to be sent or retransmit, the stack will
* immediately send up this notification.
*/
- if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT,
- &sctp_sk(sk)->subscribe)) {
- asoc = sctp_id2assoc(sk, 0);
+ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) {
+ struct sctp_ulpevent *event;
+ asoc = sctp_id2assoc(sk, 0);
if (asoc && sctp_outq_is_empty(&asoc->outqueue)) {
event = sctp_ulpevent_make_sender_dry_event(asoc,
GFP_USER | __GFP_NOWARN);
@@ -3324,8 +3335,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
__u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
sizeof(struct sctp_data_chunk);
- min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
- datasize);
+ min_len = sctp_min_frag_point(sp, datasize);
max_len = SCTP_MAX_CHUNK_LEN - datasize;
if (val < min_len || val > max_len)
@@ -4261,6 +4271,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
return 0;
}
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_ulpevent *event;
+ struct sctp_event param;
+ int retval = 0;
+
+ if (optlen < sizeof(param)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ optlen = sizeof(param);
+ if (copy_from_user(&param, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (param.se_type < SCTP_SN_TYPE_BASE ||
+ param.se_type > SCTP_SN_TYPE_MAX) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, param.se_assoc_id);
+ if (!asoc) {
+ sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe,
+ param.se_type, param.se_on);
+ goto out;
+ }
+
+ sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on);
+
+ if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+ if (sctp_outq_is_empty(&asoc->outqueue)) {
+ event = sctp_ulpevent_make_sender_dry_event(asoc,
+ GFP_USER | __GFP_NOWARN);
+ if (!event) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+ }
+ }
+
+out:
+ return retval;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4458,6 +4519,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_REUSE_PORT:
retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
break;
+ case SCTP_EVENT:
+ retval = sctp_setsockopt_event(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -4706,7 +4770,7 @@ static int sctp_init_sock(struct sock *sk)
/* Initialize default event subscriptions. By default, all the
* options are off.
*/
- memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe));
+ sp->subscribe = 0;
/* Default Peer Address Parameters. These defaults can
* be modified via SCTP_PEER_ADDR_PARAMS
@@ -5251,14 +5315,24 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
int __user *optlen)
{
+ struct sctp_event_subscribe subscribe;
+ __u8 *sn_type = (__u8 *)&subscribe;
+ int i;
+
if (len == 0)
return -EINVAL;
if (len > sizeof(struct sctp_event_subscribe))
len = sizeof(struct sctp_event_subscribe);
if (put_user(len, optlen))
return -EFAULT;
- if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len))
+
+ for (i = 0; i < len; i++)
+ sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe,
+ SCTP_SN_TYPE_BASE + i);
+
+ if (copy_to_user(optval, &subscribe, len))
return -EFAULT;
+
return 0;
}
@@ -7393,6 +7467,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
return 0;
}
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_event param;
+ __u16 subscribe;
+
+ if (len < sizeof(param))
+ return -EINVAL;
+
+ len = sizeof(param);
+ if (copy_from_user(&param, optval, len))
+ return -EFAULT;
+
+ if (param.se_type < SCTP_SN_TYPE_BASE ||
+ param.se_type > SCTP_SN_TYPE_MAX)
+ return -EINVAL;
+
+ asoc = sctp_id2assoc(sk, param.se_assoc_id);
+ subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+ param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &param, len))
+ return -EFAULT;
+
+ return 0;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7591,6 +7696,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_REUSE_PORT:
retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
break;
+ case SCTP_EVENT:
+ retval = sctp_getsockopt_event(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7628,8 +7736,10 @@ static struct sctp_bind_bucket *sctp_bucket_create(
static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
- bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
+ struct sctp_sock *sp = sctp_sk(sk);
+ bool reuse = (sk->sk_reuse || sp->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
+ kuid_t uid = sock_i_uid(sk);
struct sctp_bind_bucket *pp;
unsigned short snum;
int ret;
@@ -7705,7 +7815,10 @@ pp_found:
pr_debug("%s: found a possible match\n", __func__);
- if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
+ if ((pp->fastreuse && reuse &&
+ sk->sk_state != SCTP_SS_LISTENING) ||
+ (pp->fastreuseport && sk->sk_reuseport &&
+ uid_eq(pp->fastuid, uid)))
goto success;
/* Run through the list of sockets bound to the port
@@ -7719,16 +7832,18 @@ pp_found:
* in an endpoint.
*/
sk_for_each_bound(sk2, &pp->owner) {
- struct sctp_endpoint *ep2;
- ep2 = sctp_sk(sk2)->ep;
+ struct sctp_sock *sp2 = sctp_sk(sk2);
+ struct sctp_endpoint *ep2 = sp2->ep;
if (sk == sk2 ||
- (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
- sk2->sk_state != SCTP_SS_LISTENING))
+ (reuse && (sk2->sk_reuse || sp2->reuse) &&
+ sk2->sk_state != SCTP_SS_LISTENING) ||
+ (sk->sk_reuseport && sk2->sk_reuseport &&
+ uid_eq(uid, sock_i_uid(sk2))))
continue;
- if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr,
- sctp_sk(sk2), sctp_sk(sk))) {
+ if (sctp_bind_addr_conflict(&ep2->base.bind_addr,
+ addr, sp2, sp)) {
ret = (long)sk2;
goto fail_unlock;
}
@@ -7751,19 +7866,32 @@ pp_not_found:
pp->fastreuse = 1;
else
pp->fastreuse = 0;
- } else if (pp->fastreuse &&
- (!reuse || sk->sk_state == SCTP_SS_LISTENING))
- pp->fastreuse = 0;
+
+ if (sk->sk_reuseport) {
+ pp->fastreuseport = 1;
+ pp->fastuid = uid;
+ } else {
+ pp->fastreuseport = 0;
+ }
+ } else {
+ if (pp->fastreuse &&
+ (!reuse || sk->sk_state == SCTP_SS_LISTENING))
+ pp->fastreuse = 0;
+
+ if (pp->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid)))
+ pp->fastreuseport = 0;
+ }
/* We are set, so fill up all the data in the hash table
* entry, tie the socket list information with the rest of the
* sockets FIXME: Blurry, NPI (ipg).
*/
success:
- if (!sctp_sk(sk)->bind_hash) {
+ if (!sp->bind_hash) {
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &pp->owner);
- sctp_sk(sk)->bind_hash = pp;
+ sp->bind_hash = pp;
}
ret = 0;
@@ -7836,8 +7964,7 @@ static int sctp_listen_start(struct sock *sk, int backlog)
}
sk->sk_max_ack_backlog = backlog;
- sctp_hash_endpoint(ep);
- return 0;
+ return sctp_hash_endpoint(ep);
}
/*
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 0a78cdf86463..a6bf21579466 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -140,7 +140,7 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
struct sctp_ulpevent *cevent;
- struct sk_buff *pos;
+ struct sk_buff *pos, *loc;
pos = skb_peek_tail(&ulpq->reasm);
if (!pos) {
@@ -166,23 +166,30 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
return;
}
+ loc = NULL;
skb_queue_walk(&ulpq->reasm, pos) {
cevent = sctp_skb2event(pos);
if (event->stream < cevent->stream ||
(event->stream == cevent->stream &&
- MID_lt(event->mid, cevent->mid)))
+ MID_lt(event->mid, cevent->mid))) {
+ loc = pos;
break;
-
+ }
if (event->stream == cevent->stream &&
event->mid == cevent->mid &&
!(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
(event->msg_flags & SCTP_DATA_FIRST_FRAG ||
- event->fsn < cevent->fsn))
+ event->fsn < cevent->fsn)) {
+ loc = pos;
break;
+ }
}
- __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+ if (!loc)
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ else
+ __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event));
}
static struct sctp_ulpevent *sctp_intl_retrieve_partial(
@@ -383,7 +390,7 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
struct sctp_ulpevent *cevent;
- struct sk_buff *pos;
+ struct sk_buff *pos, *loc;
pos = skb_peek_tail(&ulpq->lobby);
if (!pos) {
@@ -403,18 +410,25 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
return;
}
+ loc = NULL;
skb_queue_walk(&ulpq->lobby, pos) {
cevent = (struct sctp_ulpevent *)pos->cb;
- if (cevent->stream > event->stream)
+ if (cevent->stream > event->stream) {
+ loc = pos;
break;
-
+ }
if (cevent->stream == event->stream &&
- MID_lt(event->mid, cevent->mid))
+ MID_lt(event->mid, cevent->mid)) {
+ loc = pos;
break;
+ }
}
- __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+ if (!loc)
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ else
+ __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event));
}
static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
@@ -489,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
sk_incoming_cpu_update(sk);
}
- if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+ if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
if (skb_list)
@@ -980,17 +994,19 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
struct sock *sk = ulpq->asoc->base.sk;
struct sctp_ulpevent *ev = NULL;
- if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
- &sctp_sk(sk)->subscribe))
+ if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
+ SCTP_PARTIAL_DELIVERY_EVENT))
return;
ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
sid, mid, flags, gfp);
if (ev) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
- if (!sctp_sk(sk)->data_ready_signalled) {
- sctp_sk(sk)->data_ready_signalled = 1;
+ if (!sp->data_ready_signalled) {
+ sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
}
}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 331cc734e3db..5dde92101743 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -219,7 +219,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sk_incoming_cpu_update(sk);
}
/* Check if the user wishes to receive this event. */
- if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+ if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
/* If we are in partial delivery mode, post to the lobby until
@@ -1129,16 +1129,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
struct sctp_ulpevent *ev = NULL;
- struct sock *sk;
struct sctp_sock *sp;
+ struct sock *sk;
if (!ulpq->pd_mode)
return;
sk = ulpq->asoc->base.sk;
sp = sctp_sk(sk);
- if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
- &sctp_sk(sk)->subscribe))
+ if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
+ SCTP_PARTIAL_DELIVERY_EVENT))
ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
SCTP_PARTIAL_DELIVERY_ABORTED,
0, 0, 0, gfp);
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5fbaf1901571..c4da4a78d369 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -147,8 +147,14 @@ static int smc_release(struct socket *sock)
sk->sk_shutdown |= SHUTDOWN_MASK;
}
if (smc->clcsock) {
+ if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
+ /* wake up clcsock accept */
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ }
+ mutex_lock(&smc->clcsock_release_lock);
sock_release(smc->clcsock);
smc->clcsock = NULL;
+ mutex_unlock(&smc->clcsock_release_lock);
}
if (smc->use_fallback) {
if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
@@ -205,6 +211,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
spin_lock_init(&smc->conn.send_lock);
sk->sk_prot->hash(sk);
sk_refcnt_debug_inc(sk);
+ mutex_init(&smc->clcsock_release_lock);
return sk;
}
@@ -301,14 +308,17 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
-/* register a new rmb, optionally send confirm_rkey msg to register with peer */
+/* register a new rmb, send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
bool conf_rkey)
{
- /* register memory region for new rmb */
- if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
- rmb_desc->regerr = 1;
- return -EFAULT;
+ if (!rmb_desc->wr_reg) {
+ /* register memory region for new rmb */
+ if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
+ rmb_desc->regerr = 1;
+ return -EFAULT;
+ }
+ rmb_desc->wr_reg = 1;
}
if (!conf_rkey)
return 0;
@@ -337,8 +347,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
if (link->llc_confirm_rc)
@@ -365,8 +375,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
}
/* send add link reject message, only one link supported for now */
@@ -535,7 +545,8 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type,
if (rc)
return rc;
/* receive SMC Accept CLC message */
- return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
+ return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
+ CLC_WAIT_TIME);
}
/* setup for RDMA connection of client */
@@ -583,8 +594,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
local_contact);
} else {
- if (!smc->conn.rmb_desc->reused &&
- smc_reg_rmb(link, smc->conn.rmb_desc, true))
+ if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
local_contact);
}
@@ -821,7 +831,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
struct socket *new_clcsock = NULL;
struct sock *lsk = &lsmc->sk;
struct sock *new_sk;
- int rc;
+ int rc = -EINVAL;
release_sock(lsk);
new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
@@ -834,7 +844,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
}
*new_smc = smc_sk(new_sk);
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ mutex_lock(&lsmc->clcsock_release_lock);
+ if (lsmc->clcsock)
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
if (rc < 0)
lsk->sk_err = -rc;
@@ -968,8 +981,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
if (link->llc_confirm_resp_rc)
@@ -989,8 +1002,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
}
smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
@@ -1145,10 +1158,8 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
if (local_contact != SMC_FIRST_CONTACT) {
- if (!new_smc->conn.rmb_desc->reused) {
- if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
- return SMC_CLC_DECL_ERR_REGRMB;
- }
+ if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
+ return SMC_CLC_DECL_ERR_REGRMB;
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
@@ -1184,7 +1195,6 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc,
return 0;
decline:
- mutex_unlock(&smc_create_lgr_pending);
smc_listen_decline(new_smc, reason_code, local_contact);
return reason_code;
}
@@ -1225,7 +1235,7 @@ static void smc_listen_work(struct work_struct *work)
*/
pclc = (struct smc_clc_msg_proposal *)&buf;
reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
- SMC_CLC_PROPOSAL);
+ SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
if (reason_code) {
smc_listen_decline(new_smc, reason_code, 0);
return;
@@ -1275,7 +1285,7 @@ static void smc_listen_work(struct work_struct *work)
/* receive SMC Confirm CLC message */
reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
- SMC_CLC_CONFIRM);
+ SMC_CLC_CONFIRM, CLC_WAIT_TIME);
if (reason_code) {
mutex_unlock(&smc_create_lgr_pending);
smc_listen_decline(new_smc, reason_code, local_contact);
@@ -1284,8 +1294,10 @@ static void smc_listen_work(struct work_struct *work)
/* finish worker */
if (!ism_supported) {
- if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
+ if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) {
+ mutex_unlock(&smc_create_lgr_pending);
return;
+ }
}
smc_conn_save_peer_info(new_smc, &cclc);
mutex_unlock(&smc_create_lgr_pending);
@@ -1357,7 +1369,6 @@ static int smc_listen(struct socket *sock, int backlog)
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
- INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
sock_hold(sk); /* sock_hold in tcp_listen_worker */
if (!schedule_work(&smc->tcp_listen_work))
sock_put(sk);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 08786ace6010..5721416d0605 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -219,6 +219,10 @@ struct smc_sock { /* smc sock container */
* started, waiting for unsent
* data to be sent
*/
+ struct mutex clcsock_release_lock;
+ /* protects clcsock of a listen
+ * socket
+ * */
};
static inline struct smc_sock *smc_sk(const struct sock *sk)
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 89c3a8c7859a..776e9dfc915d 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -265,7 +265,7 @@ out:
* clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
*/
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
- u8 expected_type)
+ u8 expected_type, unsigned long timeout)
{
long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
struct sock *clc_sk = smc->clcsock->sk;
@@ -285,7 +285,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
* sizeof(struct smc_clc_msg_hdr)
*/
krflags = MSG_PEEK | MSG_WAITALL;
- smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ clc_sk->sk_rcvtimeo = timeout;
iov_iter_kvec(&msg.msg_iter, READ, &vec, 1,
sizeof(struct smc_clc_msg_hdr));
len = sock_recvmsg(smc->clcsock, &msg, krflags);
@@ -297,7 +297,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
if (clc_sk->sk_err) {
reason_code = -clc_sk->sk_err;
- smc->sk.sk_err = clc_sk->sk_err;
+ if (clc_sk->sk_err == EAGAIN &&
+ expected_type == SMC_CLC_DECLINE)
+ clc_sk->sk_err = 0; /* reset for fallback usage */
+ else
+ smc->sk.sk_err = clc_sk->sk_err;
goto out;
}
if (!len) { /* peer has performed orderly shutdown */
@@ -306,7 +310,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
goto out;
}
if (len < 0) {
- smc->sk.sk_err = -len;
+ if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE)
+ smc->sk.sk_err = -len;
reason_code = len;
goto out;
}
@@ -346,7 +351,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
out:
- smc->clcsock->sk->sk_rcvtimeo = rcvtimeo;
+ clc_sk->sk_rcvtimeo = rcvtimeo;
return reason_code;
}
@@ -374,10 +379,8 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
sizeof(struct smc_clc_msg_decline));
if (len < sizeof(struct smc_clc_msg_decline))
- smc->sk.sk_err = EPROTO;
- if (len < 0)
- smc->sk.sk_err = -len;
- return sock_error(&smc->sk);
+ len = -EPROTO;
+ return len > 0 ? 0 : len;
}
/* send CLC PROPOSAL message across internal TCP socket */
@@ -536,7 +539,6 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
struct smc_link *link;
struct msghdr msg;
struct kvec vec;
- int rc = 0;
int len;
memset(&aclc, 0, sizeof(aclc));
@@ -589,13 +591,8 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
vec.iov_len = ntohs(aclc.hdr.length);
len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
ntohs(aclc.hdr.length));
- if (len < ntohs(aclc.hdr.length)) {
- if (len >= 0)
- new_smc->sk.sk_err = EPROTO;
- else
- new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
- rc = sock_error(&new_smc->sk);
- }
+ if (len < ntohs(aclc.hdr.length))
+ len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
- return rc;
+ return len > 0 ? 0 : len;
}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 18da89b681c2..24658e8c0de4 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -27,6 +27,7 @@
#define SMC_TYPE_D 1 /* SMC-D only */
#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
@@ -182,7 +183,7 @@ struct smcd_dev;
int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
- u8 expected_type);
+ u8 expected_type, unsigned long timeout);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
struct smc_ib_device *smcibdev, u8 ibport, u8 gid[],
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 1c9fa7f0261a..35c1cdc93e1c 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -149,6 +149,8 @@ static int smc_link_send_delete(struct smc_link *lnk)
return -ENOTCONN;
}
+static void smc_lgr_free(struct smc_link_group *lgr);
+
static void smc_lgr_free_work(struct work_struct *work)
{
struct smc_link_group *lgr = container_of(to_delayed_work(work),
@@ -171,8 +173,11 @@ free:
spin_unlock_bh(&smc_lgr_list.lock);
if (!lgr->is_smcd && !lgr->terminating) {
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
/* try to send del link msg, on error free lgr immediately */
- if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
+ if (lnk->state == SMC_LNK_ACTIVE &&
+ !smc_link_send_delete(lnk)) {
/* reschedule in case we never receive a response */
smc_lgr_schedule_free_work(lgr);
return;
@@ -295,8 +300,13 @@ static void smc_buf_unuse(struct smc_connection *conn,
conn->sndbuf_desc->used = 0;
if (conn->rmb_desc) {
if (!conn->rmb_desc->regerr) {
- conn->rmb_desc->reused = 1;
conn->rmb_desc->used = 0;
+ if (!lgr->is_smcd) {
+ /* unregister rmb with peer */
+ smc_llc_do_delete_rkey(
+ &lgr->lnk[SMC_SINGLE_LINK],
+ conn->rmb_desc);
+ }
} else {
/* buf registration failed, reuse not possible */
write_lock_bh(&lgr->rmbs_lock);
@@ -410,7 +420,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
}
/* remove a link group */
-void smc_lgr_free(struct smc_link_group *lgr)
+static void smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
if (lgr->is_smcd)
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index cf98f4d6093e..b00287989a3d 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -109,6 +109,9 @@ struct smc_link {
int llc_testlink_time; /* testlink interval */
struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */
int llc_confirm_rkey_rc; /* rc from cnf rkey msg */
+ struct completion llc_delete_rkey; /* wait 4 rx of del rkey */
+ int llc_delete_rkey_rc; /* rc from del rkey msg */
+ struct mutex llc_delete_rkey_mutex; /* serialize usage */
};
/* For now we just allow one parallel link per link group. The SMC protocol
@@ -127,7 +130,7 @@ struct smc_buf_desc {
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
- u8 reused : 1; /* new created / reused */
+ u8 wr_reg : 1; /* mem region registered */
u8 regerr : 1; /* err during registration */
union {
struct { /* SMC-R */
@@ -243,7 +246,6 @@ struct smc_sock;
struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
-void smc_lgr_free(struct smc_link_group *lgr);
void smc_lgr_forget(struct smc_link_group *lgr);
void smc_lgr_terminate(struct smc_link_group *lgr);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 9c916c709ca7..a6d3623d06f4 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -238,6 +238,29 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link,
return rc;
}
+/* send LLC delete rkey request */
+static int smc_llc_send_delete_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_llc_msg_delete_rkey *rkeyllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ return rc;
+ rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
+ memset(rkeyllc, 0, sizeof(*rkeyllc));
+ rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
+ rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey);
+ rkeyllc->num_rkeys = 1;
+ rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+ return rc;
+}
+
/* prepare an add link message */
static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
struct smc_link *link, u8 mac[], u8 gid[],
@@ -509,7 +532,9 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link,
int i, max;
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- /* unused as long as we don't send this type of msg */
+ link->llc_delete_rkey_rc = llc->hd.flags &
+ SMC_LLC_FLAG_RKEY_NEG;
+ complete(&link->llc_delete_rkey);
} else {
max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
for (i = 0; i < max; i++) {
@@ -610,6 +635,8 @@ int smc_llc_link_init(struct smc_link *link)
init_completion(&link->llc_add);
init_completion(&link->llc_add_resp);
init_completion(&link->llc_confirm_rkey);
+ init_completion(&link->llc_delete_rkey);
+ mutex_init(&link->llc_delete_rkey_mutex);
init_completion(&link->llc_testlink_resp);
INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
return 0;
@@ -650,8 +677,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link,
{
int rc;
+ /* protected by mutex smc_create_lgr_pending */
reinit_completion(&link->llc_confirm_rkey);
- smc_llc_send_confirm_rkey(link, rmb_desc);
+ rc = smc_llc_send_confirm_rkey(link, rmb_desc);
+ if (rc)
+ return rc;
/* receive CONFIRM RKEY response from server over RoCE fabric */
rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey,
SMC_LLC_WAIT_TIME);
@@ -660,6 +690,29 @@ int smc_llc_do_confirm_rkey(struct smc_link *link,
return 0;
}
+/* unregister an rtoken at the remote peer */
+int smc_llc_do_delete_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ int rc;
+
+ mutex_lock(&link->llc_delete_rkey_mutex);
+ reinit_completion(&link->llc_delete_rkey);
+ rc = smc_llc_send_delete_rkey(link, rmb_desc);
+ if (rc)
+ goto out;
+ /* receive DELETE RKEY response from server over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey,
+ SMC_LLC_WAIT_TIME);
+ if (rc <= 0 || link->llc_delete_rkey_rc)
+ rc = -EFAULT;
+ else
+ rc = 0;
+out:
+ mutex_unlock(&link->llc_delete_rkey_mutex);
+ return rc;
+}
+
/***************************** init, exit, misc ******************************/
static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 9e2ff088e301..461c0c3ef76e 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -49,6 +49,8 @@ void smc_llc_link_inactive(struct smc_link *link);
void smc_llc_link_clear(struct smc_link *link);
int smc_llc_do_confirm_rkey(struct smc_link *link,
struct smc_buf_desc *rmb_desc);
+int smc_llc_do_delete_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc);
int smc_llc_init(void) __init;
#endif /* SMC_LLC_H */
diff --git a/net/socket.c b/net/socket.c
index 334fcc617ef2..e89884e2197b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2341,8 +2341,9 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
* Linux recvmmsg interface
*/
-int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
- unsigned int flags, struct timespec64 *timeout)
+static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
+ unsigned int vlen, unsigned int flags,
+ struct timespec64 *timeout)
{
int fput_needed, err, datagrams;
struct socket *sock;
@@ -2451,25 +2452,32 @@ out_put:
return datagrams;
}
-static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
- unsigned int vlen, unsigned int flags,
- struct __kernel_timespec __user *timeout)
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
+ unsigned int vlen, unsigned int flags,
+ struct __kernel_timespec __user *timeout,
+ struct old_timespec32 __user *timeout32)
{
int datagrams;
struct timespec64 timeout_sys;
- if (flags & MSG_CMSG_COMPAT)
- return -EINVAL;
-
- if (!timeout)
- return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+ if (timeout && get_timespec64(&timeout_sys, timeout))
+ return -EFAULT;
- if (get_timespec64(&timeout_sys, timeout))
+ if (timeout32 && get_old_timespec32(&timeout_sys, timeout32))
return -EFAULT;
- datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+ if (!timeout && !timeout32)
+ return do_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+ datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
- if (datagrams > 0 && put_timespec64(&timeout_sys, timeout))
+ if (datagrams <= 0)
+ return datagrams;
+
+ if (timeout && put_timespec64(&timeout_sys, timeout))
+ datagrams = -EFAULT;
+
+ if (timeout32 && put_old_timespec32(&timeout_sys, timeout32))
datagrams = -EFAULT;
return datagrams;
@@ -2479,8 +2487,23 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags,
struct __kernel_timespec __user *, timeout)
{
- return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+
+ return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct old_timespec32 __user *, timeout)
+{
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+
+ return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);
}
+#endif
#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* Argument list sizes for sys_socketcall */
@@ -2600,8 +2623,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
a[2], true);
break;
case SYS_RECVMMSG:
- err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2],
- a[3], (struct __kernel_timespec __user *)a[4]);
+ if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
+ err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
+ a[2], a[3],
+ (struct __kernel_timespec __user *)a[4],
+ NULL);
+ else
+ err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
+ a[2], a[3], NULL,
+ (struct old_timespec32 __user *)a[4]);
break;
case SYS_ACCEPT4:
err = __sys_accept4(a0, (struct sockaddr __user *)a1,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 5d3f252659f1..ba765473d1f0 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1791,6 +1791,7 @@ priv_release_snd_buf(struct rpc_rqst *rqstp)
for (i=0; i < rqstp->rq_enc_pages_num; i++)
__free_page(rqstp->rq_enc_pages[i]);
kfree(rqstp->rq_enc_pages);
+ rqstp->rq_release_snd_buf = NULL;
}
static int
@@ -1799,6 +1800,9 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
int first, last, i;
+ if (rqstp->rq_release_snd_buf)
+ rqstp->rq_release_snd_buf(rqstp);
+
if (snd_buf->page_len == 0) {
rqstp->rq_enc_pages_num = 0;
return 0;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index ae3b8145da35..24cbddc44c88 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1915,6 +1915,13 @@ call_connect_status(struct rpc_task *task)
struct rpc_clnt *clnt = task->tk_client;
int status = task->tk_status;
+ /* Check if the task was already transmitted */
+ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
+ xprt_end_transmit(task);
+ task->tk_action = call_transmit_status;
+ return;
+ }
+
dprint_status(task);
trace_rpc_connect_status(task);
@@ -1945,6 +1952,7 @@ call_connect_status(struct rpc_task *task)
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
/* fall through */
+ case -ENOTCONN:
case -EAGAIN:
/* Check for timeouts before looping back to call_bind */
case -ETIMEDOUT:
@@ -2302,6 +2310,7 @@ out_retry:
task->tk_status = 0;
/* Note: rpc_verify_header() may have freed the RPC slot */
if (task->tk_rqstp == req) {
+ xdr_free_bvec(&req->rq_rcv_buf);
req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
if (task->tk_client->cl_discrtry)
xprt_conditional_disconnect(req->rq_xprt,
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 9062967575c4..7e55cfc69697 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -175,7 +175,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
return -1;
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
+ netdev_rx_csum_fault(skb->dev, skb);
return 0;
no_checksum:
if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 86bea4520c4d..73547d17d3c6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -67,7 +67,6 @@
*/
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
-static void xprt_connect_status(struct rpc_task *task);
static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
@@ -680,7 +679,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- xprt_wake_pending_tasks(xprt, -EAGAIN);
+ else if (xprt->snd_task)
+ rpc_wake_up_queued_task_set_status(&xprt->pending,
+ xprt->snd_task, -ENOTCONN);
spin_unlock_bh(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_force_disconnect);
@@ -820,46 +821,25 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_connected(xprt)) {
task->tk_timeout = task->tk_rqstp->rq_timeout;
task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
- rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
+ rpc_sleep_on(&xprt->pending, task, NULL);
if (test_bit(XPRT_CLOSING, &xprt->state))
return;
if (xprt_test_and_set_connecting(xprt))
return;
- xprt->stat.connect_start = jiffies;
- xprt->ops->connect(xprt, task);
+ /* Race breaker */
+ if (!xprt_connected(xprt)) {
+ xprt->stat.connect_start = jiffies;
+ xprt->ops->connect(xprt, task);
+ } else {
+ xprt_clear_connecting(xprt);
+ task->tk_status = 0;
+ rpc_wake_up_queued_task(&xprt->pending, task);
+ }
}
xprt_release_write(xprt, task);
}
-static void xprt_connect_status(struct rpc_task *task)
-{
- switch (task->tk_status) {
- case 0:
- dprintk("RPC: %5u xprt_connect_status: connection established\n",
- task->tk_pid);
- break;
- case -ECONNREFUSED:
- case -ECONNRESET:
- case -ECONNABORTED:
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -EPIPE:
- case -EAGAIN:
- dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
- break;
- case -ETIMEDOUT:
- dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
- "out\n", task->tk_pid);
- break;
- default:
- dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
- "server %s\n", task->tk_pid, -task->tk_status,
- task->tk_rqstp->rq_xprt->servername);
- task->tk_status = -EIO;
- }
-}
-
enum xprt_xid_rb_cmp {
XID_RB_EQUAL,
XID_RB_LEFT,
@@ -1623,6 +1603,8 @@ xprt_request_init(struct rpc_task *task)
req->rq_snd_buf.buflen = 0;
req->rq_rcv_buf.len = 0;
req->rq_rcv_buf.buflen = 0;
+ req->rq_snd_buf.bvec = NULL;
+ req->rq_rcv_buf.bvec = NULL;
req->rq_release_snd_buf = NULL;
xprt_reset_majortimeo(req);
dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ae77c71c1f64..f0b3700cec95 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -330,18 +330,16 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
{
size_t i,n;
- if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+ if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES))
return want;
- if (want > buf->page_len)
- want = buf->page_len;
n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < n; i++) {
if (buf->pages[i])
continue;
buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp);
if (!buf->pages[i]) {
- buf->page_len = (i * PAGE_SIZE) - buf->page_base;
- return buf->page_len;
+ i *= PAGE_SIZE;
+ return i > buf->page_base ? i - buf->page_base : 0;
}
}
return want;
@@ -378,8 +376,8 @@ static ssize_t
xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
size_t count)
{
- struct kvec kvec = { 0 };
- return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0);
+ iov_iter_discard(&msg->msg_iter, READ, count);
+ return sock_recvmsg(sock, msg, flags);
}
static ssize_t
@@ -398,16 +396,17 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
- goto eagain;
+ goto out;
seek = 0;
} else {
seek -= buf->head[0].iov_len;
offset += buf->head[0].iov_len;
}
- if (seek < buf->page_len) {
- want = xs_alloc_sparse_pages(buf,
- min_t(size_t, count - offset, buf->page_len),
- GFP_NOWAIT);
+
+ want = xs_alloc_sparse_pages(buf,
+ min_t(size_t, count - offset, buf->page_len),
+ GFP_NOWAIT);
+ if (seek < want) {
ret = xs_read_bvec(sock, msg, flags, buf->bvec,
xdr_buf_pagecount(buf),
want + buf->page_base,
@@ -418,12 +417,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
- goto eagain;
+ goto out;
seek = 0;
} else {
- seek -= buf->page_len;
- offset += buf->page_len;
+ seek -= want;
+ offset += want;
}
+
if (seek < buf->tail[0].iov_len) {
want = min_t(size_t, count - offset, buf->tail[0].iov_len);
ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek);
@@ -433,17 +433,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
- goto eagain;
+ goto out;
} else
offset += buf->tail[0].iov_len;
ret = -EMSGSIZE;
- msg->msg_flags |= MSG_TRUNC;
out:
*read = offset - seek_init;
return ret;
-eagain:
- ret = -EAGAIN;
- goto out;
sock_err:
offset += seek;
goto out;
@@ -486,19 +482,20 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
if (transport->recv.offset == transport->recv.len) {
if (xs_read_stream_request_done(transport))
msg->msg_flags |= MSG_EOR;
- return transport->recv.copied;
+ return read;
}
switch (ret) {
+ default:
+ break;
+ case -EFAULT:
case -EMSGSIZE:
- return transport->recv.copied;
+ msg->msg_flags |= MSG_TRUNC;
+ return read;
case 0:
return -ESHUTDOWN;
- default:
- if (ret < 0)
- return ret;
}
- return -EAGAIN;
+ return ret < 0 ? ret : read;
}
static size_t
@@ -537,7 +534,7 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
ret = xs_read_stream_request(transport, msg, flags, req);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
- xprt_complete_bc_request(req, ret);
+ xprt_complete_bc_request(req, transport->recv.copied);
return ret;
}
@@ -570,7 +567,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
spin_lock(&xprt->queue_lock);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
- xprt_complete_rqst(req->rq_task, ret);
+ xprt_complete_rqst(req->rq_task, transport->recv.copied);
xprt_unpin_rqst(req);
out:
spin_unlock(&xprt->queue_lock);
@@ -591,10 +588,8 @@ xs_read_stream(struct sock_xprt *transport, int flags)
if (ret <= 0)
goto out_err;
transport->recv.offset = ret;
- if (ret != want) {
- ret = -EAGAIN;
- goto out_err;
- }
+ if (transport->recv.offset != want)
+ return transport->recv.offset;
transport->recv.len = be32_to_cpu(transport->recv.fraghdr) &
RPC_FRAGMENT_SIZE_MASK;
transport->recv.offset -= sizeof(transport->recv.fraghdr);
@@ -602,6 +597,9 @@ xs_read_stream(struct sock_xprt *transport, int flags)
}
switch (be32_to_cpu(transport->recv.calldir)) {
+ default:
+ msg.msg_flags |= MSG_TRUNC;
+ break;
case RPC_CALL:
ret = xs_read_stream_call(transport, &msg, flags);
break;
@@ -616,6 +614,9 @@ xs_read_stream(struct sock_xprt *transport, int flags)
goto out_err;
read += ret;
if (transport->recv.offset < transport->recv.len) {
+ if (!(msg.msg_flags & MSG_TRUNC))
+ return read;
+ msg.msg_flags = 0;
ret = xs_read_discard(transport->sock, &msg, flags,
transport->recv.len - transport->recv.offset);
if (ret <= 0)
@@ -623,7 +624,7 @@ xs_read_stream(struct sock_xprt *transport, int flags)
transport->recv.offset += ret;
read += ret;
if (transport->recv.offset != transport->recv.len)
- return -EAGAIN;
+ return read;
}
if (xs_read_stream_request_done(transport)) {
trace_xs_stream_read_request(transport);
@@ -633,13 +634,7 @@ xs_read_stream(struct sock_xprt *transport, int flags)
transport->recv.len = 0;
return read;
out_err:
- switch (ret) {
- case 0:
- case -ESHUTDOWN:
- xprt_force_disconnect(&transport->xprt);
- return -ESHUTDOWN;
- }
- return ret;
+ return ret != 0 ? ret : -ESHUTDOWN;
}
static void xs_stream_data_receive(struct sock_xprt *transport)
@@ -648,12 +643,12 @@ static void xs_stream_data_receive(struct sock_xprt *transport)
ssize_t ret = 0;
mutex_lock(&transport->recv_mutex);
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
if (transport->sock == NULL)
goto out;
- clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
for (;;) {
ret = xs_read_stream(transport, MSG_DONTWAIT);
- if (ret <= 0)
+ if (ret < 0)
break;
read += ret;
cond_resched();
@@ -1222,6 +1217,8 @@ static void xs_reset_transport(struct sock_xprt *transport)
trace_rpc_socket_close(xprt, sock);
sock_release(sock);
+
+ xprt_disconnect_done(xprt);
}
/**
@@ -1242,8 +1239,6 @@ static void xs_close(struct rpc_xprt *xprt)
xs_reset_transport(transport);
xprt->reestablish_timeout = 0;
-
- xprt_disconnect_done(xprt);
}
static void xs_inject_disconnect(struct rpc_xprt *xprt)
@@ -1345,10 +1340,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
int err;
mutex_lock(&transport->recv_mutex);
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
sk = transport->inet;
if (sk == NULL)
goto out;
- clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
for (;;) {
skb = skb_recv_udp(sk, 0, 1, &err);
if (skb == NULL)
@@ -1494,8 +1489,6 @@ static void xs_tcp_state_change(struct sock *sk)
&transport->sock_state))
xprt_clear_connecting(xprt);
clear_bit(XPRT_CLOSING, &xprt->state);
- if (sk->sk_err)
- xprt_wake_pending_tasks(xprt, -sk->sk_err);
/* Trigger the socket release */
xs_tcp_force_close(xprt);
}
@@ -2097,8 +2090,8 @@ static void xs_udp_setup_socket(struct work_struct *work)
trace_rpc_socket_connect(xprt, sock, 0);
status = 0;
out:
- xprt_unlock_connect(xprt, transport);
xprt_clear_connecting(xprt);
+ xprt_unlock_connect(xprt, transport);
xprt_wake_pending_tasks(xprt, status);
}
@@ -2334,8 +2327,8 @@ static void xs_tcp_setup_socket(struct work_struct *work)
}
status = -EAGAIN;
out:
- xprt_unlock_connect(xprt, transport);
xprt_clear_connecting(xprt);
+ xprt_unlock_connect(xprt, transport);
xprt_wake_pending_tasks(xprt, status);
}
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 74b9d916a58b..5df9d1138ac9 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -353,34 +353,35 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
return 0;
}
-static int __switchdev_port_obj_add(struct net_device *dev,
- const struct switchdev_obj *obj,
- struct switchdev_trans *trans)
+static int switchdev_port_obj_notify(enum switchdev_notifier_type nt,
+ struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans,
+ struct netlink_ext_ack *extack)
{
- const struct switchdev_ops *ops = dev->switchdev_ops;
- struct net_device *lower_dev;
- struct list_head *iter;
- int err = -EOPNOTSUPP;
-
- if (ops && ops->switchdev_port_obj_add)
- return ops->switchdev_port_obj_add(dev, obj, trans);
+ int rc;
+ int err;
- /* Switch device port(s) may be stacked under
- * bond/team/vlan dev, so recurse down to add object on
- * each port.
- */
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .obj = obj,
+ .trans = trans,
+ .handled = false,
+ };
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = __switchdev_port_obj_add(lower_dev, obj, trans);
- if (err)
- break;
+ rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack);
+ err = notifier_to_errno(rc);
+ if (err) {
+ WARN_ON(!obj_info.handled);
+ return err;
}
-
- return err;
+ if (!obj_info.handled)
+ return -EOPNOTSUPP;
+ return 0;
}
static int switchdev_port_obj_add_now(struct net_device *dev,
- const struct switchdev_obj *obj)
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
{
struct switchdev_trans trans;
int err;
@@ -397,7 +398,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev,
*/
trans.ph_prepare = true;
- err = __switchdev_port_obj_add(dev, obj, &trans);
+ err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD,
+ dev, obj, &trans, extack);
if (err) {
/* Prepare phase failed: abort the transaction. Any
* resources reserved in the prepare phase are
@@ -416,7 +418,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev,
*/
trans.ph_prepare = false;
- err = __switchdev_port_obj_add(dev, obj, &trans);
+ err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD,
+ dev, obj, &trans, extack);
WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id);
switchdev_trans_items_warn_destroy(dev, &trans);
@@ -429,7 +432,7 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev,
const struct switchdev_obj *obj = data;
int err;
- err = switchdev_port_obj_add_now(dev, obj);
+ err = switchdev_port_obj_add_now(dev, obj, NULL);
if (err && err != -EOPNOTSUPP)
netdev_err(dev, "failed (err=%d) to add object (id=%d)\n",
err, obj->id);
@@ -459,38 +462,21 @@ static int switchdev_port_obj_add_defer(struct net_device *dev,
* in case SWITCHDEV_F_DEFER flag is not set.
*/
int switchdev_port_obj_add(struct net_device *dev,
- const struct switchdev_obj *obj)
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
{
if (obj->flags & SWITCHDEV_F_DEFER)
return switchdev_port_obj_add_defer(dev, obj);
ASSERT_RTNL();
- return switchdev_port_obj_add_now(dev, obj);
+ return switchdev_port_obj_add_now(dev, obj, extack);
}
EXPORT_SYMBOL_GPL(switchdev_port_obj_add);
static int switchdev_port_obj_del_now(struct net_device *dev,
const struct switchdev_obj *obj)
{
- const struct switchdev_ops *ops = dev->switchdev_ops;
- struct net_device *lower_dev;
- struct list_head *iter;
- int err = -EOPNOTSUPP;
-
- if (ops && ops->switchdev_port_obj_del)
- return ops->switchdev_port_obj_del(dev, obj);
-
- /* Switch device port(s) may be stacked under
- * bond/team/vlan dev, so recurse down to delete object on
- * each port.
- */
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = switchdev_port_obj_del_now(lower_dev, obj);
- if (err)
- break;
- }
-
- return err;
+ return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL,
+ dev, obj, NULL, NULL);
}
static void switchdev_port_obj_del_deferred(struct net_device *dev,
@@ -535,6 +521,7 @@ int switchdev_port_obj_del(struct net_device *dev,
EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
+static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
/**
* register_switchdev_notifier - Register notifier
@@ -572,10 +559,38 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
struct switchdev_notifier_info *info)
{
info->dev = dev;
+ info->extack = NULL;
return atomic_notifier_call_chain(&switchdev_notif_chain, val, info);
}
EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
+int register_switchdev_blocking_notifier(struct notifier_block *nb)
+{
+ struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
+
+ return blocking_notifier_chain_register(chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier);
+
+int unregister_switchdev_blocking_notifier(struct notifier_block *nb)
+{
+ struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
+
+ return blocking_notifier_chain_unregister(chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier);
+
+int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
+ struct switchdev_notifier_info *info,
+ struct netlink_ext_ack *extack)
+{
+ info->dev = dev;
+ info->extack = extack;
+ return blocking_notifier_call_chain(&switchdev_blocking_notif_chain,
+ val, info);
+}
+EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers);
+
bool switchdev_port_same_parent_id(struct net_device *a,
struct net_device *b)
{
@@ -595,3 +610,109 @@ bool switchdev_port_same_parent_id(struct net_device *a,
return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
}
EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id);
+
+static int __switchdev_handle_port_obj_add(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ int (*add_cb)(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans,
+ struct netlink_ext_ack *extack))
+{
+ struct netlink_ext_ack *extack;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ extack = switchdev_notifier_info_to_extack(&port_obj_info->info);
+
+ if (check_cb(dev)) {
+ /* This flag is only checked if the return value is success. */
+ port_obj_info->handled = true;
+ return add_cb(dev, port_obj_info->obj, port_obj_info->trans,
+ extack);
+ }
+
+ /* Switch ports might be stacked under e.g. a LAG. Ignore the
+ * unsupported devices, another driver might be able to handle them. But
+ * propagate to the callers any hard errors.
+ *
+ * If the driver does its own bookkeeping of stacked ports, it's not
+ * necessary to go through this helper.
+ */
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info,
+ check_cb, add_cb);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ return err;
+}
+
+int switchdev_handle_port_obj_add(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ int (*add_cb)(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans,
+ struct netlink_ext_ack *extack))
+{
+ int err;
+
+ err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb,
+ add_cb);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add);
+
+static int __switchdev_handle_port_obj_del(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ int (*del_cb)(struct net_device *dev,
+ const struct switchdev_obj *obj))
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ if (check_cb(dev)) {
+ /* This flag is only checked if the return value is success. */
+ port_obj_info->handled = true;
+ return del_cb(dev, port_obj_info->obj);
+ }
+
+ /* Switch ports might be stacked under e.g. a LAG. Ignore the
+ * unsupported devices, another driver might be able to handle them. But
+ * propagate to the callers any hard errors.
+ *
+ * If the driver does its own bookkeeping of stacked ports, it's not
+ * necessary to go through this helper.
+ */
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info,
+ check_cb, del_cb);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ return err;
+}
+
+int switchdev_handle_port_obj_del(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ int (*del_cb)(struct net_device *dev,
+ const struct switchdev_obj *obj))
+{
+ int err;
+
+ err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb,
+ del_cb);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del);
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index aca168f2abb1..c86aba0282af 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -9,7 +9,9 @@ tipc-y += addr.o bcast.o bearer.o \
core.o link.o discover.o msg.o \
name_distr.o subscr.o monitor.o name_table.o net.o \
netlink.o netlink_compat.o node.o socket.o eth_media.o \
- topsrv.o socket.o group.o
+ topsrv.o socket.o group.o trace.o
+
+CFLAGS_trace.o += -I$(src)
tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o
tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index e65c3a8551e4..fb2c0d8f359f 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -43,6 +43,7 @@
#include "bcast.h"
#include "netlink.h"
#include "udp_media.h"
+#include "trace.h"
#define MAX_ADDR_STR 60
@@ -99,7 +100,7 @@ static struct tipc_media *media_find_id(u8 type)
/**
* tipc_media_addr_printf - record media address in print buffer
*/
-void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
+int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
{
char addr_str[MAX_ADDR_STR];
struct tipc_media *m;
@@ -114,9 +115,10 @@ void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
ret = scnprintf(buf, len, "UNKNOWN(%u)", a->media_id);
for (i = 0; i < sizeof(a->value); i++)
- ret += scnprintf(buf - ret, len + ret,
- "-%02x", a->value[i]);
+ ret += scnprintf(buf + ret, len - ret,
+ "-%x", a->value[i]);
}
+ return ret;
}
/**
@@ -607,6 +609,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
if (!b)
return NOTIFY_DONE;
+ trace_tipc_l2_device_event(dev, b, evt);
switch (evt) {
case NETDEV_CHANGE:
if (netif_carrier_ok(dev) && netif_oper_up(dev)) {
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 394290cbbb1d..7f4c569594a5 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -207,7 +207,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info);
int tipc_media_set_priority(const char *name, u32 new_value);
int tipc_media_set_window(const char *name, u32 new_value);
-void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a);
+int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a);
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
struct nlattr *attrs[]);
void tipc_disable_l2_media(struct tipc_bearer *b);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 836727e363c4..2792a3cae682 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -43,6 +43,7 @@
#include "discover.h"
#include "netlink.h"
#include "monitor.h"
+#include "trace.h"
#include <linux/pkt_sched.h>
@@ -105,7 +106,7 @@ struct tipc_stats {
* @transmitq: queue for sent, non-acked messages
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
- * @last_retransmitted: sequence number of most recently retransmitted message
+ * @prev_from: sequence number of most previous retransmission request
* @stale_cnt: counter for number of identical retransmit attempts
* @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
@@ -163,7 +164,7 @@ struct tipc_link {
u16 limit;
} backlog[5];
u16 snd_nxt;
- u16 last_retransm;
+ u16 prev_from;
u16 window;
u16 stale_cnt;
unsigned long stale_limit;
@@ -186,9 +187,6 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
- unsigned long prev_retr;
- u16 prev_from;
- u16 prev_to;
u8 nack_state;
bool bc_peer_is_up;
@@ -210,7 +208,7 @@ enum {
BC_NACK_SND_SUPPRESS,
};
-#define TIPC_BC_RETR_LIMIT 10 /* [ms] */
+#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */
/*
* Interval between NACKs when packets arrive out of order
@@ -359,9 +357,11 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
rcv_l->bc_peer_is_up = true;
rcv_l->state = LINK_ESTABLISHED;
tipc_link_bc_ack_rcv(rcv_l, ack, xmitq);
+ trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!");
tipc_link_reset(rcv_l);
rcv_l->state = LINK_RESET;
if (!snd_l->ackers) {
+ trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!");
tipc_link_reset(snd_l);
snd_l->state = LINK_RESET;
__skb_queue_purge(xmitq);
@@ -525,6 +525,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
l = *link;
strcpy(l->name, tipc_bclink_name);
+ trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!");
tipc_link_reset(l);
l->state = LINK_RESET;
l->ackers = 0;
@@ -549,6 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
int tipc_link_fsm_evt(struct tipc_link *l, int evt)
{
int rc = 0;
+ int old_state = l->state;
switch (l->state) {
case LINK_RESETTING:
@@ -695,10 +697,12 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt)
default:
pr_err("Unknown FSM state %x in %s\n", l->state, l->name);
}
+ trace_tipc_link_fsm(l->name, old_state, l->state, evt);
return rc;
illegal_evt:
pr_err("Illegal FSM event %x in state %x on link %s\n",
evt, l->state, l->name);
+ trace_tipc_link_fsm(l->name, old_state, l->state, evt);
return rc;
}
@@ -743,6 +747,18 @@ static void link_profile_stats(struct tipc_link *l)
l->stats.msg_length_profile[6]++;
}
+/**
+ * tipc_link_too_silent - check if link is "too silent"
+ * @l: tipc link to be checked
+ *
+ * Returns true if the link 'silent_intv_cnt' is about to reach the
+ * 'abort_limit' value, otherwise false
+ */
+bool tipc_link_too_silent(struct tipc_link *l)
+{
+ return (l->silent_intv_cnt + 2 > l->abort_limit);
+}
+
/* tipc_link_timeout - perform periodic task as instructed from node timeout
*/
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -756,6 +772,8 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
u16 bc_acked = l->bc_rcvlink->acked;
struct tipc_mon_state *mstate = &l->mon_state;
+ trace_tipc_link_timeout(l, TIPC_DUMP_NONE, " ");
+ trace_tipc_link_too_silent(l, TIPC_DUMP_ALL, " ");
switch (l->state) {
case LINK_ESTABLISHED:
case LINK_SYNCHING:
@@ -818,6 +836,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
skb_queue_tail(&l->wakeupq, skb);
l->stats.link_congs++;
+ trace_tipc_link_conges(l, TIPC_DUMP_ALL, "wakeup scheduled!");
return -ELINKCONG;
}
@@ -948,6 +967,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
}
__skb_dequeue(list);
__skb_queue_tail(transmq, skb);
+ /* next retransmit attempt */
+ if (link_is_bc_sndlink(l))
+ TIPC_SKB_CB(skb)->nxt_retr =
+ jiffies + TIPC_BC_RETR_LIM;
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
l->rcv_unacked = 0;
@@ -995,6 +1018,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
hdr = buf_msg(skb);
l->backlog[msg_importance(hdr)].len--;
__skb_queue_tail(&l->transmq, skb);
+ /* next retransmit attempt */
+ if (link_is_bc_sndlink(l))
+ TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
+
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
msg_set_seqno(hdr, seqno);
@@ -1036,14 +1063,20 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
if (!skb)
return 0;
+ if (less(to, from))
+ return 0;
+ trace_tipc_link_retrans(r, from, to, &l->transmq);
/* Detect repeated retransmit failures on same packet */
- if (r->last_retransm != buf_seqno(skb)) {
- r->last_retransm = buf_seqno(skb);
+ if (r->prev_from != from) {
+ r->prev_from = from;
r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
r->stale_cnt = 0;
} else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
link_retransmit_failure(l, skb);
+ trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
+ trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
+ trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
if (link_is_bc_sndlink(l))
return TIPC_LINK_DOWN_EVT;
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
@@ -1055,6 +1088,11 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
continue;
if (more(msg_seqno(hdr), to))
break;
+ if (link_is_bc_sndlink(l)) {
+ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
+ continue;
+ TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
+ }
_skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
@@ -1398,6 +1436,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
l->stats.sent_nacks++;
skb->priority = TC_PRIO_CONTROL;
__skb_queue_tail(xmitq, skb);
+ trace_tipc_proto_build(skb, false, l->name);
}
void tipc_link_create_dummy_tnl_msg(struct tipc_link *l,
@@ -1561,6 +1600,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
char *if_name;
int rc = 0;
+ trace_tipc_proto_rcv(skb, false, l->name);
if (tipc_link_is_blocked(l) || !xmitq)
goto exit;
@@ -1571,8 +1611,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
hdr = buf_msg(skb);
data = msg_data(hdr);
- if (!tipc_link_validate_msg(l, hdr))
+ if (!tipc_link_validate_msg(l, hdr)) {
+ trace_tipc_skb_dump(skb, false, "PROTO invalid (1)!");
+ trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (1)!");
goto exit;
+ }
switch (mtyp) {
case RESET_MSG:
@@ -1737,42 +1780,6 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
}
-/* link_bc_retr eval()- check if the indicated range can be retransmitted now
- * - Adjust permitted range if there is overlap with previous retransmission
- */
-static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
-{
- unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
-
- if (less(*to, *from))
- return false;
-
- /* New retransmission request */
- if ((elapsed > TIPC_BC_RETR_LIMIT) ||
- less(*to, l->prev_from) || more(*from, l->prev_to)) {
- l->prev_from = *from;
- l->prev_to = *to;
- l->prev_retr = jiffies;
- return true;
- }
-
- /* Inside range of previous retransmit */
- if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
- return false;
-
- /* Fully or partially outside previous range => exclude overlap */
- if (less(*from, l->prev_from)) {
- *to = l->prev_from - 1;
- l->prev_from = *from;
- }
- if (more(*to, l->prev_to)) {
- *from = l->prev_to + 1;
- l->prev_to = *to;
- }
- l->prev_retr = jiffies;
- return true;
-}
-
/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
*/
int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
@@ -1803,8 +1810,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
- if (link_bc_retr_eval(snd_l, &from, &to))
- rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
+ rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
@@ -1852,6 +1858,7 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
if (!more(acked, l->acked))
return;
+ trace_tipc_link_bc_ack(l, l->acked, acked, &snd_l->transmq);
/* Skip over packets peer has already acked */
skb_queue_walk(&snd_l->transmq, skb) {
if (more(buf_seqno(skb), l->acked))
@@ -2255,3 +2262,122 @@ void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit)
{
l->abort_limit = limit;
}
+
+char *tipc_link_name_ext(struct tipc_link *l, char *buf)
+{
+ if (!l)
+ scnprintf(buf, TIPC_MAX_LINK_NAME, "null");
+ else if (link_is_bc_sndlink(l))
+ scnprintf(buf, TIPC_MAX_LINK_NAME, "broadcast-sender");
+ else if (link_is_bc_rcvlink(l))
+ scnprintf(buf, TIPC_MAX_LINK_NAME,
+ "broadcast-receiver, peer %x", l->addr);
+ else
+ memcpy(buf, l->name, TIPC_MAX_LINK_NAME);
+
+ return buf;
+}
+
+/**
+ * tipc_link_dump - dump TIPC link data
+ * @l: tipc link to be dumped
+ * @dqueues: bitmask to decide if any link queue to be dumped?
+ * - TIPC_DUMP_NONE: don't dump link queues
+ * - TIPC_DUMP_TRANSMQ: dump link transmq queue
+ * - TIPC_DUMP_BACKLOGQ: dump link backlog queue
+ * - TIPC_DUMP_DEFERDQ: dump link deferd queue
+ * - TIPC_DUMP_INPUTQ: dump link input queue
+ * - TIPC_DUMP_WAKEUP: dump link wakeup queue
+ * - TIPC_DUMP_ALL: dump all the link queues above
+ * @buf: returned buffer of dump data in format
+ */
+int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf)
+{
+ int i = 0;
+ size_t sz = (dqueues) ? LINK_LMAX : LINK_LMIN;
+ struct sk_buff_head *list;
+ struct sk_buff *hskb, *tskb;
+ u32 len;
+
+ if (!l) {
+ i += scnprintf(buf, sz, "link data: (null)\n");
+ return i;
+ }
+
+ i += scnprintf(buf, sz, "link data: %x", l->addr);
+ i += scnprintf(buf + i, sz - i, " %x", l->state);
+ i += scnprintf(buf + i, sz - i, " %u", l->in_session);
+ i += scnprintf(buf + i, sz - i, " %u", l->session);
+ i += scnprintf(buf + i, sz - i, " %u", l->peer_session);
+ i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt);
+ i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt);
+ i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt_state);
+ i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt_state);
+ i += scnprintf(buf + i, sz - i, " %x", l->peer_caps);
+ i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", l->prev_from);
+ i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", l->acked);
+
+ list = &l->transmq;
+ len = skb_queue_len(list);
+ hskb = skb_peek(list);
+ tskb = skb_peek_tail(list);
+ i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
+ (hskb) ? msg_seqno(buf_msg(hskb)) : 0,
+ (tskb) ? msg_seqno(buf_msg(tskb)) : 0);
+
+ list = &l->deferdq;
+ len = skb_queue_len(list);
+ hskb = skb_peek(list);
+ tskb = skb_peek_tail(list);
+ i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
+ (hskb) ? msg_seqno(buf_msg(hskb)) : 0,
+ (tskb) ? msg_seqno(buf_msg(tskb)) : 0);
+
+ list = &l->backlogq;
+ len = skb_queue_len(list);
+ hskb = skb_peek(list);
+ tskb = skb_peek_tail(list);
+ i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
+ (hskb) ? msg_seqno(buf_msg(hskb)) : 0,
+ (tskb) ? msg_seqno(buf_msg(tskb)) : 0);
+
+ list = l->inputq;
+ len = skb_queue_len(list);
+ hskb = skb_peek(list);
+ tskb = skb_peek_tail(list);
+ i += scnprintf(buf + i, sz - i, " | %u %u %u\n", len,
+ (hskb) ? msg_seqno(buf_msg(hskb)) : 0,
+ (tskb) ? msg_seqno(buf_msg(tskb)) : 0);
+
+ if (dqueues & TIPC_DUMP_TRANSMQ) {
+ i += scnprintf(buf + i, sz - i, "transmq: ");
+ i += tipc_list_dump(&l->transmq, false, buf + i);
+ }
+ if (dqueues & TIPC_DUMP_BACKLOGQ) {
+ i += scnprintf(buf + i, sz - i,
+ "backlogq: <%u %u %u %u %u>, ",
+ l->backlog[TIPC_LOW_IMPORTANCE].len,
+ l->backlog[TIPC_MEDIUM_IMPORTANCE].len,
+ l->backlog[TIPC_HIGH_IMPORTANCE].len,
+ l->backlog[TIPC_CRITICAL_IMPORTANCE].len,
+ l->backlog[TIPC_SYSTEM_IMPORTANCE].len);
+ i += tipc_list_dump(&l->backlogq, false, buf + i);
+ }
+ if (dqueues & TIPC_DUMP_DEFERDQ) {
+ i += scnprintf(buf + i, sz - i, "deferdq: ");
+ i += tipc_list_dump(&l->deferdq, false, buf + i);
+ }
+ if (dqueues & TIPC_DUMP_INPUTQ) {
+ i += scnprintf(buf + i, sz - i, "inputq: ");
+ i += tipc_list_dump(l->inputq, false, buf + i);
+ }
+ if (dqueues & TIPC_DUMP_WAKEUP) {
+ i += scnprintf(buf + i, sz - i, "wakeup: ");
+ i += tipc_list_dump(&l->wakeupq, false, buf + i);
+ }
+
+ return i;
+}
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 90488c538a4e..8439e0ee53a8 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -109,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l);
u16 tipc_link_acked(struct tipc_link *l);
u32 tipc_link_id(struct tipc_link *l);
char *tipc_link_name(struct tipc_link *l);
+char *tipc_link_name_ext(struct tipc_link *l, char *buf);
u32 tipc_link_state(struct tipc_link *l);
char tipc_link_plane(struct tipc_link *l);
int tipc_link_prio(struct tipc_link *l);
@@ -147,4 +148,5 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
struct sk_buff_head *xmitq);
int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *xmitq);
+bool tipc_link_too_silent(struct tipc_link *l);
#endif
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index a2879e6ec5b6..a0924956bb61 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -105,6 +105,7 @@ struct tipc_skb_cb {
u32 bytes_read;
u32 orig_member;
struct sk_buff *tail;
+ unsigned long nxt_retr;
bool validated;
u16 chain_imp;
u16 ackers;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 6376467e78f8..21f6ccc89401 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -951,8 +951,11 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
u32 node;
struct nlattr *con[TIPC_NLA_CON_MAX + 1];
- nla_parse_nested(con, TIPC_NLA_CON_MAX,
- sock[TIPC_NLA_SOCK_CON], NULL, NULL);
+ err = nla_parse_nested(con, TIPC_NLA_CON_MAX,
+ sock[TIPC_NLA_SOCK_CON], NULL, NULL);
+
+ if (err)
+ return err;
node = nla_get_u32(con[TIPC_NLA_CON_NODE]);
tipc_tlv_sprintf(msg->rep, " connected to <%u.%u.%u:%u>",
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 2afc4f8c37a7..db2a6c3e0be9 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -43,6 +43,7 @@
#include "monitor.h"
#include "discover.h"
#include "netlink.h"
+#include "trace.h"
#define INVALID_NODE_SIG 0x10000
#define NODE_CLEANUP_AFTER 300000
@@ -432,6 +433,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
break;
}
list_add_tail_rcu(&n->list, &temp_node->list);
+ trace_tipc_node_create(n, true, " ");
exit:
spin_unlock_bh(&tn->node_list_lock);
return n;
@@ -459,6 +461,7 @@ static void tipc_node_delete_from_list(struct tipc_node *node)
static void tipc_node_delete(struct tipc_node *node)
{
+ trace_tipc_node_delete(node, true, " ");
tipc_node_delete_from_list(node);
del_timer_sync(&node->timer);
@@ -584,12 +587,15 @@ static void tipc_node_clear_links(struct tipc_node *node)
/* tipc_node_cleanup - delete nodes that does not
* have active links for NODE_CLEANUP_AFTER time
*/
-static int tipc_node_cleanup(struct tipc_node *peer)
+static bool tipc_node_cleanup(struct tipc_node *peer)
{
struct tipc_net *tn = tipc_net(peer->net);
bool deleted = false;
- spin_lock_bh(&tn->node_list_lock);
+ /* If lock held by tipc_node_stop() the node will be deleted anyway */
+ if (!spin_trylock_bh(&tn->node_list_lock))
+ return false;
+
tipc_node_write_lock(peer);
if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) {
@@ -613,6 +619,7 @@ static void tipc_node_timeout(struct timer_list *t)
int bearer_id;
int rc = 0;
+ trace_tipc_node_timeout(n, false, " ");
if (!node_is_up(n) && tipc_node_cleanup(n)) {
/*Removing the reference of Timer*/
tipc_node_put(n);
@@ -621,6 +628,12 @@ static void tipc_node_timeout(struct timer_list *t)
__skb_queue_head_init(&xmitq);
+ /* Initial node interval to value larger (10 seconds), then it will be
+ * recalculated with link lowest tolerance
+ */
+ tipc_node_read_lock(n);
+ n->keepalive_intv = 10000;
+ tipc_node_read_unlock(n);
for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) {
tipc_node_read_lock(n);
le = &n->links[bearer_id];
@@ -672,6 +685,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
pr_debug("Established link <%s> on network plane %c\n",
tipc_link_name(nl), tipc_link_plane(nl));
+ trace_tipc_node_link_up(n, true, " ");
/* Ensure that a STATE message goes first */
tipc_link_build_state_msg(nl, xmitq);
@@ -774,6 +788,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
if (tipc_link_peer_is_down(l))
tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT);
+ trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!");
tipc_link_fsm_evt(l, LINK_RESET_EVT);
tipc_link_reset(l);
tipc_link_build_reset_msg(l, xmitq);
@@ -791,6 +806,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1);
tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq);
+ trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!");
tipc_link_reset(l);
tipc_link_fsm_evt(l, LINK_RESET_EVT);
tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
@@ -823,6 +839,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
/* Defuse pending tipc_node_link_up() */
tipc_link_fsm_evt(l, LINK_RESET_EVT);
}
+ trace_tipc_node_link_down(n, true, "node link down or deleted!");
tipc_node_write_unlock(n);
if (delete)
tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
@@ -1012,6 +1029,7 @@ void tipc_node_check_dest(struct net *net, u32 addr,
*respond = false;
goto exit;
}
+ trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!");
tipc_link_reset(l);
tipc_link_fsm_evt(l, LINK_RESET_EVT);
if (n->state == NODE_FAILINGOVER)
@@ -1051,6 +1069,7 @@ static void tipc_node_reset_links(struct tipc_node *n)
pr_warn("Resetting all links to %x\n", n->addr);
+ trace_tipc_node_reset_links(n, true, " ");
for (i = 0; i < MAX_BEARERS; i++) {
tipc_node_link_down(n, i, false);
}
@@ -1226,11 +1245,13 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
pr_err("Unknown node fsm state %x\n", state);
break;
}
+ trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
n->state = state;
return;
illegal_evt:
pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
+ trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
}
static void node_lost_contact(struct tipc_node *n,
@@ -1244,6 +1265,7 @@ static void node_lost_contact(struct tipc_node *n,
pr_debug("Lost contact with %x\n", n->addr);
n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
+ trace_tipc_node_lost_contact(n, true, " ");
/* Clean up broadcast state */
tipc_bcast_remove_peer(n->net, n->bc_entry.link);
@@ -1540,6 +1562,10 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
if (!skb_queue_empty(&be->inputq1))
tipc_node_mcast_rcv(n);
+ /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */
+ if (!skb_queue_empty(&n->bc_entry.namedq))
+ tipc_named_rcv(net, &n->bc_entry.namedq);
+
/* If reassembly or retransmission failure => reset all links to peer */
if (rc & TIPC_LINK_DOWN_EVT)
tipc_node_reset_links(n);
@@ -1568,6 +1594,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
struct tipc_media_addr *maddr;
int pb_id;
+ if (trace_tipc_node_check_state_enabled()) {
+ trace_tipc_skb_dump(skb, false, "skb for node state check");
+ trace_tipc_node_check_state(n, true, " ");
+ }
l = n->links[bearer_id].link;
if (!l)
return false;
@@ -1585,8 +1615,11 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
}
}
- if (!tipc_link_validate_msg(l, hdr))
+ if (!tipc_link_validate_msg(l, hdr)) {
+ trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!");
+ trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!");
return false;
+ }
/* Check and update node accesibility if applicable */
if (state == SELF_UP_PEER_COMING) {
@@ -1616,6 +1649,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
syncpt = oseqno + exp_pkts - 1;
if (pl && tipc_link_is_up(pl)) {
__tipc_node_link_down(n, &pb_id, xmitq, &maddr);
+ trace_tipc_node_link_down(n, true,
+ "node link down <- failover!");
tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
tipc_link_inputq(l));
}
@@ -2422,3 +2457,65 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
return skb->len;
}
+
+u32 tipc_node_get_addr(struct tipc_node *node)
+{
+ return (node) ? node->addr : 0;
+}
+
+/**
+ * tipc_node_dump - dump TIPC node data
+ * @n: tipc node to be dumped
+ * @more: dump more?
+ * - false: dump only tipc node data
+ * - true: dump node link data as well
+ * @buf: returned buffer of dump data in format
+ */
+int tipc_node_dump(struct tipc_node *n, bool more, char *buf)
+{
+ int i = 0;
+ size_t sz = (more) ? NODE_LMAX : NODE_LMIN;
+
+ if (!n) {
+ i += scnprintf(buf, sz, "node data: (null)\n");
+ return i;
+ }
+
+ i += scnprintf(buf, sz, "node data: %x", n->addr);
+ i += scnprintf(buf + i, sz - i, " %x", n->state);
+ i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]);
+ i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]);
+ i += scnprintf(buf + i, sz - i, " %x", n->action_flags);
+ i += scnprintf(buf + i, sz - i, " %u", n->failover_sent);
+ i += scnprintf(buf + i, sz - i, " %u", n->sync_point);
+ i += scnprintf(buf + i, sz - i, " %d", n->link_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", n->working_links);
+ i += scnprintf(buf + i, sz - i, " %x", n->capabilities);
+ i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv);
+
+ if (!more)
+ return i;
+
+ i += scnprintf(buf + i, sz - i, "link_entry[0]:\n");
+ i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu);
+ i += scnprintf(buf + i, sz - i, " media: ");
+ i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr);
+ i += scnprintf(buf + i, sz - i, "\n");
+ i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i);
+ i += scnprintf(buf + i, sz - i, " inputq: ");
+ i += tipc_list_dump(&n->links[0].inputq, false, buf + i);
+
+ i += scnprintf(buf + i, sz - i, "link_entry[1]:\n");
+ i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu);
+ i += scnprintf(buf + i, sz - i, " media: ");
+ i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr);
+ i += scnprintf(buf + i, sz - i, "\n");
+ i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i);
+ i += scnprintf(buf + i, sz - i, " inputq: ");
+ i += tipc_list_dump(&n->links[1].inputq, false, buf + i);
+
+ i += scnprintf(buf + i, sz - i, "bclink:\n ");
+ i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i);
+
+ return i;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 03f5efb62cfb..4f59a30e989a 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -65,6 +65,7 @@ enum {
void tipc_node_stop(struct net *net);
bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
+u32 tipc_node_get_addr(struct tipc_node *node);
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_bearer *bearer,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index b57b1be7252b..1217c90a363b 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -46,6 +46,7 @@
#include "bcast.h"
#include "netlink.h"
#include "group.h"
+#include "trace.h"
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */
@@ -233,6 +234,7 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen)
*/
static void tsk_advance_rx_queue(struct sock *sk)
{
+ trace_tipc_sk_advance_rx(sk, NULL, TIPC_DUMP_SK_RCVQ, " ");
kfree_skb(__skb_dequeue(&sk->sk_receive_queue));
}
@@ -247,6 +249,7 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
if (!tipc_msg_reverse(onode, &skb, err))
return;
+ trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE, "@sk_respond!");
dnode = msg_destnode(buf_msg(skb));
selector = msg_origport(buf_msg(skb));
tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
@@ -482,6 +485,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
tsk_set_unreliable(tsk, true);
}
+ trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " ");
return 0;
}
@@ -571,6 +575,7 @@ static int tipc_release(struct socket *sock)
tsk = tipc_sk(sk);
lock_sock(sk);
+ trace_tipc_sk_release(sk, NULL, TIPC_DUMP_ALL, " ");
__tipc_shutdown(sock, TIPC_ERR_NO_PORT);
sk->sk_shutdown = SHUTDOWN_MASK;
tipc_sk_leave(tsk);
@@ -718,6 +723,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
__poll_t revents = 0;
sock_poll_wait(file, sock, wait);
+ trace_tipc_sk_poll(sk, NULL, TIPC_DUMP_ALL, " ");
if (sk->sk_shutdown & RCV_SHUTDOWN)
revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
@@ -804,9 +810,12 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
/* Send message if build was successful */
- if (unlikely(rc == dlen))
+ if (unlikely(rc == dlen)) {
+ trace_tipc_sk_sendmcast(sk, skb_peek(&pkts),
+ TIPC_DUMP_SK_SNDQ, " ");
rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
&tsk->cong_link_cnt);
+ }
tipc_nlist_purge(&dsts);
@@ -880,7 +889,6 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
int blks = tsk_blocks(GROUP_H_SIZE + dlen);
struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_group *grp = tsk->group;
struct net *net = sock_net(sk);
struct tipc_member *mb = NULL;
u32 node, port;
@@ -894,7 +902,9 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
/* Block or return if destination link or member is congested */
rc = tipc_wait_for_cond(sock, &timeout,
!tipc_dest_find(&tsk->cong_links, node, 0) &&
- !tipc_group_cong(grp, node, port, blks, &mb));
+ tsk->group &&
+ !tipc_group_cong(tsk->group, node, port, blks,
+ &mb));
if (unlikely(rc))
return rc;
@@ -924,7 +934,6 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
struct tipc_sock *tsk = tipc_sk(sk);
struct list_head *cong_links = &tsk->cong_links;
int blks = tsk_blocks(GROUP_H_SIZE + dlen);
- struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
struct tipc_member *first = NULL;
struct tipc_member *mbr = NULL;
@@ -941,9 +950,10 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
type = msg_nametype(hdr);
inst = dest->addr.name.name.instance;
scope = msg_lookup_scope(hdr);
- exclude = tipc_group_exclude(grp);
while (++lookups < 4) {
+ exclude = tipc_group_exclude(tsk->group);
+
first = NULL;
/* Look for a non-congested destination member, if any */
@@ -952,7 +962,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
&dstcnt, exclude, false))
return -EHOSTUNREACH;
tipc_dest_pop(&dsts, &node, &port);
- cong = tipc_group_cong(grp, node, port, blks, &mbr);
+ cong = tipc_group_cong(tsk->group, node, port, blks,
+ &mbr);
if (!cong)
break;
if (mbr == first)
@@ -971,7 +982,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
/* Block or return if destination link or member is congested */
rc = tipc_wait_for_cond(sock, &timeout,
!tipc_dest_find(cong_links, node, 0) &&
- !tipc_group_cong(grp, node, port,
+ tsk->group &&
+ !tipc_group_cong(tsk->group, node, port,
blks, &mbr));
if (unlikely(rc))
return rc;
@@ -1006,8 +1018,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_group *grp = tsk->group;
- struct tipc_nlist *dsts = tipc_group_dests(grp);
+ struct tipc_nlist *dsts;
struct tipc_mc_method *method = &tsk->mc_method;
bool ack = method->mandatory && method->rcast;
int blks = tsk_blocks(MCAST_H_SIZE + dlen);
@@ -1016,15 +1027,17 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
struct sk_buff_head pkts;
int rc = -EHOSTUNREACH;
- if (!dsts->local && !dsts->remote)
- return -EHOSTUNREACH;
-
/* Block or return if any destination link or member is congested */
- rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt &&
- !tipc_group_bc_cong(grp, blks));
+ rc = tipc_wait_for_cond(sock, &timeout,
+ !tsk->cong_link_cnt && tsk->group &&
+ !tipc_group_bc_cong(tsk->group, blks));
if (unlikely(rc))
return rc;
+ dsts = tipc_group_dests(tsk->group);
+ if (!dsts->local && !dsts->remote)
+ return -EHOSTUNREACH;
+
/* Complete message header */
if (dest) {
msg_set_type(hdr, TIPC_GRP_MCAST_MSG);
@@ -1036,7 +1049,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
msg_set_hdr_sz(hdr, GROUP_H_SIZE);
msg_set_destport(hdr, 0);
msg_set_destnode(hdr, 0);
- msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp));
+ msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(tsk->group));
/* Avoid getting stuck with repeated forced replicasts */
msg_set_grp_bc_ack_req(hdr, ack);
@@ -1208,8 +1221,10 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
bool conn_cong;
/* Ignore if connection cannot be validated: */
- if (!tsk_peer_msg(tsk, hdr))
+ if (!tsk_peer_msg(tsk, hdr)) {
+ trace_tipc_sk_drop_msg(sk, skb, TIPC_DUMP_NONE, "@proto_rcv!");
goto exit;
+ }
if (unlikely(msg_errcode(hdr))) {
tipc_set_sk_state(sk, TIPC_DISCONNECTING);
@@ -1377,6 +1392,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue)))
return -ENOMEM;
+ trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " ");
rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
if (unlikely(rc == -ELINKCONG)) {
tipc_dest_push(clinks, dnode, 0);
@@ -1454,6 +1470,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
if (unlikely(rc != send))
break;
+ trace_tipc_sk_sendstream(sk, skb_peek(&pkts),
+ TIPC_DUMP_SK_SNDQ, " ");
rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
if (unlikely(rc == -ELINKCONG)) {
tsk->cong_link_cnt = 1;
@@ -2128,6 +2146,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head inputq;
int limit, err = TIPC_OK;
+ trace_tipc_sk_filter_rcv(sk, skb, TIPC_DUMP_ALL, " ");
TIPC_SKB_CB(skb)->bytes_read = 0;
__skb_queue_head_init(&inputq);
__skb_queue_tail(&inputq, skb);
@@ -2147,17 +2166,25 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
(!grp && msg_in_group(hdr)))
err = TIPC_ERR_NO_PORT;
else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) {
+ trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL,
+ "err_overload2!");
atomic_inc(&sk->sk_drops);
err = TIPC_ERR_OVERLOAD;
}
if (unlikely(err)) {
- tipc_skb_reject(net, err, skb, xmitq);
+ if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) {
+ trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE,
+ "@filter_rcv!");
+ __skb_queue_tail(xmitq, skb);
+ }
err = TIPC_OK;
continue;
}
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
+ trace_tipc_sk_overlimit2(sk, skb, TIPC_DUMP_ALL,
+ "rcvq >90% allocated!");
sk->sk_data_ready(sk);
}
}
@@ -2223,14 +2250,21 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
if (!sk->sk_backlog.len)
atomic_set(dcnt, 0);
lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
- if (likely(!sk_add_backlog(sk, skb, lim)))
+ if (likely(!sk_add_backlog(sk, skb, lim))) {
+ trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL,
+ "bklg & rcvq >90% allocated!");
continue;
+ }
+ trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!");
/* Overload => reject message back to sender */
onode = tipc_own_addr(sock_net(sk));
atomic_inc(&sk->sk_drops);
- if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD))
+ if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) {
+ trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL,
+ "@sk_enqueue!");
__skb_queue_tail(xmitq, skb);
+ }
break;
}
}
@@ -2279,6 +2313,8 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
/* Prepare for message rejection */
if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err))
continue;
+
+ trace_tipc_sk_rej_msg(NULL, skb, TIPC_DUMP_NONE, "@sk_rcv!");
xmit:
dnode = msg_destnode(buf_msg(skb));
tipc_node_xmit_skb(net, skb, dnode, dport);
@@ -2552,6 +2588,7 @@ static int tipc_shutdown(struct socket *sock, int how)
lock_sock(sk);
+ trace_tipc_sk_shutdown(sk, NULL, TIPC_DUMP_ALL, " ");
__tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
sk->sk_shutdown = SEND_SHUTDOWN;
@@ -2724,11 +2761,15 @@ void tipc_sk_reinit(struct net *net)
rhashtable_walk_start(&iter);
while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
- spin_lock_bh(&tsk->sk.sk_lock.slock);
+ sock_hold(&tsk->sk);
+ rhashtable_walk_stop(&iter);
+ lock_sock(&tsk->sk);
msg = &tsk->phdr;
msg_set_prevnode(msg, tipc_own_addr(net));
msg_set_orignode(msg, tipc_own_addr(net));
- spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ release_sock(&tsk->sk);
+ rhashtable_walk_start(&iter);
+ sock_put(&tsk->sk);
}
rhashtable_walk_stop(&iter);
@@ -3564,3 +3605,187 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+
+/**
+ * tipc_sk_filtering - check if a socket should be traced
+ * @sk: the socket to be examined
+ * @sysctl_tipc_sk_filter[]: the socket tuple for filtering,
+ * (portid, sock type, name type, name lower, name upper)
+ *
+ * Returns true if the socket meets the socket tuple data
+ * (value 0 = 'any') or when there is no tuple set (all = 0),
+ * otherwise false
+ */
+bool tipc_sk_filtering(struct sock *sk)
+{
+ struct tipc_sock *tsk;
+ struct publication *p;
+ u32 _port, _sktype, _type, _lower, _upper;
+ u32 type = 0, lower = 0, upper = 0;
+
+ if (!sk)
+ return true;
+
+ tsk = tipc_sk(sk);
+
+ _port = sysctl_tipc_sk_filter[0];
+ _sktype = sysctl_tipc_sk_filter[1];
+ _type = sysctl_tipc_sk_filter[2];
+ _lower = sysctl_tipc_sk_filter[3];
+ _upper = sysctl_tipc_sk_filter[4];
+
+ if (!_port && !_sktype && !_type && !_lower && !_upper)
+ return true;
+
+ if (_port)
+ return (_port == tsk->portid);
+
+ if (_sktype && _sktype != sk->sk_type)
+ return false;
+
+ if (tsk->published) {
+ p = list_first_entry_or_null(&tsk->publications,
+ struct publication, binding_sock);
+ if (p) {
+ type = p->type;
+ lower = p->lower;
+ upper = p->upper;
+ }
+ }
+
+ if (!tipc_sk_type_connectionless(sk)) {
+ type = tsk->conn_type;
+ lower = tsk->conn_instance;
+ upper = tsk->conn_instance;
+ }
+
+ if ((_type && _type != type) || (_lower && _lower != lower) ||
+ (_upper && _upper != upper))
+ return false;
+
+ return true;
+}
+
+u32 tipc_sock_get_portid(struct sock *sk)
+{
+ return (sk) ? (tipc_sk(sk))->portid : 0;
+}
+
+/**
+ * tipc_sk_overlimit1 - check if socket rx queue is about to be overloaded,
+ * both the rcv and backlog queues are considered
+ * @sk: tipc sk to be checked
+ * @skb: tipc msg to be checked
+ *
+ * Returns true if the socket rx queue allocation is > 90%, otherwise false
+ */
+
+bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb)
+{
+ atomic_t *dcnt = &tipc_sk(sk)->dupl_rcvcnt;
+ unsigned int lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
+ unsigned int qsize = sk->sk_backlog.len + sk_rmem_alloc_get(sk);
+
+ return (qsize > lim * 90 / 100);
+}
+
+/**
+ * tipc_sk_overlimit2 - check if socket rx queue is about to be overloaded,
+ * only the rcv queue is considered
+ * @sk: tipc sk to be checked
+ * @skb: tipc msg to be checked
+ *
+ * Returns true if the socket rx queue allocation is > 90%, otherwise false
+ */
+
+bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb)
+{
+ unsigned int lim = rcvbuf_limit(sk, skb);
+ unsigned int qsize = sk_rmem_alloc_get(sk);
+
+ return (qsize > lim * 90 / 100);
+}
+
+/**
+ * tipc_sk_dump - dump TIPC socket
+ * @sk: tipc sk to be dumped
+ * @dqueues: bitmask to decide if any socket queue to be dumped?
+ * - TIPC_DUMP_NONE: don't dump socket queues
+ * - TIPC_DUMP_SK_SNDQ: dump socket send queue
+ * - TIPC_DUMP_SK_RCVQ: dump socket rcv queue
+ * - TIPC_DUMP_SK_BKLGQ: dump socket backlog queue
+ * - TIPC_DUMP_ALL: dump all the socket queues above
+ * @buf: returned buffer of dump data in format
+ */
+int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf)
+{
+ int i = 0;
+ size_t sz = (dqueues) ? SK_LMAX : SK_LMIN;
+ struct tipc_sock *tsk;
+ struct publication *p;
+ bool tsk_connected;
+
+ if (!sk) {
+ i += scnprintf(buf, sz, "sk data: (null)\n");
+ return i;
+ }
+
+ tsk = tipc_sk(sk);
+ tsk_connected = !tipc_sk_type_connectionless(sk);
+
+ i += scnprintf(buf, sz, "sk data: %u", sk->sk_type);
+ i += scnprintf(buf + i, sz - i, " %d", sk->sk_state);
+ i += scnprintf(buf + i, sz - i, " %x", tsk_own_node(tsk));
+ i += scnprintf(buf + i, sz - i, " %u", tsk->portid);
+ i += scnprintf(buf + i, sz - i, " | %u", tsk_connected);
+ if (tsk_connected) {
+ i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk));
+ i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk));
+ i += scnprintf(buf + i, sz - i, " %u", tsk->conn_type);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->conn_instance);
+ }
+ i += scnprintf(buf + i, sz - i, " | %u", tsk->published);
+ if (tsk->published) {
+ p = list_first_entry_or_null(&tsk->publications,
+ struct publication, binding_sock);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->type : 0);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->lower : 0);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->upper : 0);
+ }
+ i += scnprintf(buf + i, sz - i, " | %u", tsk->snd_win);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_win);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->max_pkt);
+ i += scnprintf(buf + i, sz - i, " %x", tsk->peer_caps);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->cong_link_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->snt_unacked);
+ i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_unacked);
+ i += scnprintf(buf + i, sz - i, " %u", atomic_read(&tsk->dupl_rcvcnt));
+ i += scnprintf(buf + i, sz - i, " %u", sk->sk_shutdown);
+ i += scnprintf(buf + i, sz - i, " | %d", sk_wmem_alloc_get(sk));
+ i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf);
+ i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk));
+ i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf);
+ i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len);
+
+ if (dqueues & TIPC_DUMP_SK_SNDQ) {
+ i += scnprintf(buf + i, sz - i, "sk_write_queue: ");
+ i += tipc_list_dump(&sk->sk_write_queue, false, buf + i);
+ }
+
+ if (dqueues & TIPC_DUMP_SK_RCVQ) {
+ i += scnprintf(buf + i, sz - i, "sk_receive_queue: ");
+ i += tipc_list_dump(&sk->sk_receive_queue, false, buf + i);
+ }
+
+ if (dqueues & TIPC_DUMP_SK_BKLGQ) {
+ i += scnprintf(buf + i, sz - i, "sk_backlog:\n head ");
+ i += tipc_skb_dump(sk->sk_backlog.head, false, buf + i);
+ if (sk->sk_backlog.tail != sk->sk_backlog.head) {
+ i += scnprintf(buf + i, sz - i, " tail ");
+ i += tipc_skb_dump(sk->sk_backlog.tail, false,
+ buf + i);
+ }
+ }
+
+ return i;
+}
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index 5e575f205afe..235b9679acee 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -71,4 +71,8 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
int tipc_dump_start(struct netlink_callback *cb);
int __tipc_dump_start(struct netlink_callback *cb, struct net *net);
int tipc_dump_done(struct netlink_callback *cb);
+u32 tipc_sock_get_portid(struct sock *sk);
+bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb);
+bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb);
+
#endif
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 1a779b1e8510..3481e4906bd6 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -34,6 +34,7 @@
*/
#include "core.h"
+#include "trace.h"
#include <linux/sysctl.h>
@@ -54,6 +55,13 @@ static struct ctl_table tipc_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "sk_filter",
+ .data = &sysctl_tipc_sk_filter,
+ .maxlen = sizeof(sysctl_tipc_sk_filter),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{}
};
diff --git a/net/tipc/trace.c b/net/tipc/trace.c
new file mode 100644
index 000000000000..964823841efe
--- /dev/null
+++ b/net/tipc/trace.c
@@ -0,0 +1,206 @@
+/*
+ * net/tipc/trace.c: TIPC tracepoints code
+ *
+ * Copyright (c) 2018, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+/**
+ * socket tuples for filtering in socket traces:
+ * (portid, sock type, name type, name lower, name upper)
+ */
+unsigned long sysctl_tipc_sk_filter[5] __read_mostly = {0, };
+
+/**
+ * tipc_skb_dump - dump TIPC skb data
+ * @skb: skb to be dumped
+ * @more: dump more?
+ * - false: dump only tipc msg data
+ * - true: dump kernel-related skb data and tipc cb[] array as well
+ * @buf: returned buffer of dump data in format
+ */
+int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf)
+{
+ int i = 0;
+ size_t sz = (more) ? SKB_LMAX : SKB_LMIN;
+ struct tipc_msg *hdr;
+ struct tipc_skb_cb *skbcb;
+
+ if (!skb) {
+ i += scnprintf(buf, sz, "msg: (null)\n");
+ return i;
+ }
+
+ hdr = buf_msg(skb);
+ skbcb = TIPC_SKB_CB(skb);
+
+ /* tipc msg data section */
+ i += scnprintf(buf, sz, "msg: %u", msg_user(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_type(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_hdr_sz(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_data_sz(hdr));
+ i += scnprintf(buf + i, sz - i, " %x", msg_orignode(hdr));
+ i += scnprintf(buf + i, sz - i, " %x", msg_destnode(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_seqno(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_ack(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_bcast_ack(hdr));
+ switch (msg_user(hdr)) {
+ case LINK_PROTOCOL:
+ i += scnprintf(buf + i, sz - i, " %c", msg_net_plane(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_probe(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_peer_stopping(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_session(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_next_sent(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_seq_gap(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_bc_snd_nxt(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_bc_gap(hdr));
+ break;
+ case TIPC_LOW_IMPORTANCE:
+ case TIPC_MEDIUM_IMPORTANCE:
+ case TIPC_HIGH_IMPORTANCE:
+ case TIPC_CRITICAL_IMPORTANCE:
+ case CONN_MANAGER:
+ case SOCK_WAKEUP:
+ i += scnprintf(buf + i, sz - i, " | %u", msg_origport(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_destport(hdr));
+ switch (msg_type(hdr)) {
+ case TIPC_NAMED_MSG:
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_nametype(hdr));
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_nameinst(hdr));
+ break;
+ case TIPC_MCAST_MSG:
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_nametype(hdr));
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_namelower(hdr));
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_nameupper(hdr));
+ break;
+ default:
+ break;
+ };
+ i += scnprintf(buf + i, sz - i, " | %u",
+ msg_src_droppable(hdr));
+ i += scnprintf(buf + i, sz - i, " %u",
+ msg_dest_droppable(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_errcode(hdr));
+ i += scnprintf(buf + i, sz - i, " %u", msg_reroute_cnt(hdr));
+ break;
+ default:
+ /* need more? */
+ break;
+ };
+
+ i += scnprintf(buf + i, sz - i, "\n");
+ if (!more)
+ return i;
+
+ /* kernel-related skb data section */
+ i += scnprintf(buf + i, sz - i, "skb: %s",
+ (skb->dev) ? skb->dev->name : "n/a");
+ i += scnprintf(buf + i, sz - i, " %u", skb->len);
+ i += scnprintf(buf + i, sz - i, " %u", skb->data_len);
+ i += scnprintf(buf + i, sz - i, " %u", skb->hdr_len);
+ i += scnprintf(buf + i, sz - i, " %u", skb->truesize);
+ i += scnprintf(buf + i, sz - i, " %u", skb_cloned(skb));
+ i += scnprintf(buf + i, sz - i, " %p", skb->sk);
+ i += scnprintf(buf + i, sz - i, " %u", skb_shinfo(skb)->nr_frags);
+ i += scnprintf(buf + i, sz - i, " %llx",
+ ktime_to_ms(skb_get_ktime(skb)));
+ i += scnprintf(buf + i, sz - i, " %llx\n",
+ ktime_to_ms(skb_hwtstamps(skb)->hwtstamp));
+
+ /* tipc skb cb[] data section */
+ i += scnprintf(buf + i, sz - i, "cb[]: %u", skbcb->bytes_read);
+ i += scnprintf(buf + i, sz - i, " %u", skbcb->orig_member);
+ i += scnprintf(buf + i, sz - i, " %u",
+ jiffies_to_msecs(skbcb->nxt_retr));
+ i += scnprintf(buf + i, sz - i, " %u", skbcb->validated);
+ i += scnprintf(buf + i, sz - i, " %u", skbcb->chain_imp);
+ i += scnprintf(buf + i, sz - i, " %u\n", skbcb->ackers);
+
+ return i;
+}
+
+/**
+ * tipc_list_dump - dump TIPC skb list/queue
+ * @list: list of skbs to be dumped
+ * @more: dump more?
+ * - false: dump only the head & tail skbs
+ * - true: dump the first & last 5 skbs
+ * @buf: returned buffer of dump data in format
+ */
+int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf)
+{
+ int i = 0;
+ size_t sz = (more) ? LIST_LMAX : LIST_LMIN;
+ u32 count, len;
+ struct sk_buff *hskb, *tskb, *skb, *tmp;
+
+ if (!list) {
+ i += scnprintf(buf, sz, "(null)\n");
+ return i;
+ }
+
+ len = skb_queue_len(list);
+ i += scnprintf(buf, sz, "len = %d\n", len);
+
+ if (!len)
+ return i;
+
+ if (!more) {
+ hskb = skb_peek(list);
+ i += scnprintf(buf + i, sz - i, " head ");
+ i += tipc_skb_dump(hskb, false, buf + i);
+ if (len > 1) {
+ tskb = skb_peek_tail(list);
+ i += scnprintf(buf + i, sz - i, " tail ");
+ i += tipc_skb_dump(tskb, false, buf + i);
+ }
+ } else {
+ count = 0;
+ skb_queue_walk_safe(list, skb, tmp) {
+ count++;
+ if (count == 6)
+ i += scnprintf(buf + i, sz - i, " .\n .\n");
+ if (count > 5 && count <= len - 5)
+ continue;
+ i += scnprintf(buf + i, sz - i, " #%d ", count);
+ i += tipc_skb_dump(skb, false, buf + i);
+ }
+ }
+ return i;
+}
diff --git a/net/tipc/trace.h b/net/tipc/trace.h
new file mode 100644
index 000000000000..4d8e00483afc
--- /dev/null
+++ b/net/tipc/trace.h
@@ -0,0 +1,431 @@
+/*
+ * net/tipc/trace.h: TIPC tracepoints
+ *
+ * Copyright (c) 2018, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tipc
+
+#if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TIPC_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "core.h"
+#include "link.h"
+#include "socket.h"
+#include "node.h"
+
+#define SKB_LMIN (100)
+#define SKB_LMAX (SKB_LMIN * 2)
+#define LIST_LMIN (SKB_LMIN * 3)
+#define LIST_LMAX (SKB_LMIN * 11)
+#define SK_LMIN (SKB_LMIN * 2)
+#define SK_LMAX (SKB_LMIN * 11)
+#define LINK_LMIN (SKB_LMIN)
+#define LINK_LMAX (SKB_LMIN * 16)
+#define NODE_LMIN (SKB_LMIN)
+#define NODE_LMAX (SKB_LMIN * 11)
+
+#ifndef __TIPC_TRACE_ENUM
+#define __TIPC_TRACE_ENUM
+enum {
+ TIPC_DUMP_NONE = 0,
+
+ TIPC_DUMP_TRANSMQ = 1,
+ TIPC_DUMP_BACKLOGQ = (1 << 1),
+ TIPC_DUMP_DEFERDQ = (1 << 2),
+ TIPC_DUMP_INPUTQ = (1 << 3),
+ TIPC_DUMP_WAKEUP = (1 << 4),
+
+ TIPC_DUMP_SK_SNDQ = (1 << 8),
+ TIPC_DUMP_SK_RCVQ = (1 << 9),
+ TIPC_DUMP_SK_BKLGQ = (1 << 10),
+ TIPC_DUMP_ALL = 0xffffu
+};
+#endif
+
+/* Link & Node FSM states: */
+#define state_sym(val) \
+ __print_symbolic(val, \
+ {(0xe), "ESTABLISHED" },\
+ {(0xe << 4), "ESTABLISHING" },\
+ {(0x1 << 8), "RESET" },\
+ {(0x2 << 12), "RESETTING" },\
+ {(0xd << 16), "PEER_RESET" },\
+ {(0xf << 20), "FAILINGOVER" },\
+ {(0xc << 24), "SYNCHING" },\
+ {(0xdd), "SELF_DOWN_PEER_DOWN" },\
+ {(0xaa), "SELF_UP_PEER_UP" },\
+ {(0xd1), "SELF_DOWN_PEER_LEAVING" },\
+ {(0xac), "SELF_UP_PEER_COMING" },\
+ {(0xca), "SELF_COMING_PEER_UP" },\
+ {(0x1d), "SELF_LEAVING_PEER_DOWN" },\
+ {(0xf0), "FAILINGOVER" },\
+ {(0xcc), "SYNCHING" })
+
+/* Link & Node FSM events: */
+#define evt_sym(val) \
+ __print_symbolic(val, \
+ {(0xec1ab1e), "ESTABLISH_EVT" },\
+ {(0x9eed0e), "PEER_RESET_EVT" },\
+ {(0xfa110e), "FAILURE_EVT" },\
+ {(0x10ca1d0e), "RESET_EVT" },\
+ {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\
+ {(0xfa110ede), "FAILOVER_END_EVT" },\
+ {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\
+ {(0xc1ccede), "SYNCH_END_EVT" },\
+ {(0xece), "SELF_ESTABL_CONTACT_EVT" },\
+ {(0x1ce), "SELF_LOST_CONTACT_EVT" },\
+ {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\
+ {(0x91ce), "PEER_LOST_CONTACT_EVT" },\
+ {(0xfbe), "FAILOVER_BEGIN_EVT" },\
+ {(0xfee), "FAILOVER_END_EVT" },\
+ {(0xcbe), "SYNCH_BEGIN_EVT" },\
+ {(0xcee), "SYNCH_END_EVT" })
+
+/* Bearer, net device events: */
+#define dev_evt_sym(val) \
+ __print_symbolic(val, \
+ {(NETDEV_CHANGE), "NETDEV_CHANGE" },\
+ {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\
+ {(NETDEV_UP), "NETDEV_UP" },\
+ {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\
+ {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\
+ {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\
+ {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" })
+
+extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly;
+
+int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf);
+int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf);
+int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf);
+int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf);
+int tipc_node_dump(struct tipc_node *n, bool more, char *buf);
+bool tipc_sk_filtering(struct sock *sk);
+
+DECLARE_EVENT_CLASS(tipc_skb_class,
+
+ TP_PROTO(struct sk_buff *skb, bool more, const char *header),
+
+ TP_ARGS(skb, more, header),
+
+ TP_STRUCT__entry(
+ __string(header, header)
+ __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN)
+ ),
+
+ TP_fast_assign(
+ __assign_str(header, header);
+ tipc_skb_dump(skb, more, __get_str(buf));
+ ),
+
+ TP_printk("%s\n%s", __get_str(header), __get_str(buf))
+)
+
+#define DEFINE_SKB_EVENT(name) \
+DEFINE_EVENT(tipc_skb_class, name, \
+ TP_PROTO(struct sk_buff *skb, bool more, const char *header), \
+ TP_ARGS(skb, more, header))
+DEFINE_SKB_EVENT(tipc_skb_dump);
+DEFINE_SKB_EVENT(tipc_proto_build);
+DEFINE_SKB_EVENT(tipc_proto_rcv);
+
+DECLARE_EVENT_CLASS(tipc_list_class,
+
+ TP_PROTO(struct sk_buff_head *list, bool more, const char *header),
+
+ TP_ARGS(list, more, header),
+
+ TP_STRUCT__entry(
+ __string(header, header)
+ __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN)
+ ),
+
+ TP_fast_assign(
+ __assign_str(header, header);
+ tipc_list_dump(list, more, __get_str(buf));
+ ),
+
+ TP_printk("%s\n%s", __get_str(header), __get_str(buf))
+);
+
+#define DEFINE_LIST_EVENT(name) \
+DEFINE_EVENT(tipc_list_class, name, \
+ TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \
+ TP_ARGS(list, more, header))
+DEFINE_LIST_EVENT(tipc_list_dump);
+
+DECLARE_EVENT_CLASS(tipc_sk_class,
+
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues,
+ const char *header),
+
+ TP_ARGS(sk, skb, dqueues, header),
+
+ TP_STRUCT__entry(
+ __string(header, header)
+ __field(u32, portid)
+ __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN)
+ __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1)
+ ),
+
+ TP_fast_assign(
+ __assign_str(header, header);
+ __entry->portid = tipc_sock_get_portid(sk);
+ tipc_sk_dump(sk, dqueues, __get_str(buf));
+ if (skb)
+ tipc_skb_dump(skb, false, __get_str(skb_buf));
+ else
+ *(__get_str(skb_buf)) = '\0';
+ ),
+
+ TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header),
+ __get_str(skb_buf), __get_str(buf))
+);
+
+#define DEFINE_SK_EVENT_FILTER(name) \
+DEFINE_EVENT_CONDITION(tipc_sk_class, name, \
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \
+ const char *header), \
+ TP_ARGS(sk, skb, dqueues, header), \
+ TP_CONDITION(tipc_sk_filtering(sk)))
+DEFINE_SK_EVENT_FILTER(tipc_sk_dump);
+DEFINE_SK_EVENT_FILTER(tipc_sk_create);
+DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast);
+DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg);
+DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream);
+DEFINE_SK_EVENT_FILTER(tipc_sk_poll);
+DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv);
+DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx);
+DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg);
+DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg);
+DEFINE_SK_EVENT_FILTER(tipc_sk_release);
+DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown);
+
+#define DEFINE_SK_EVENT_FILTER_COND(name, cond) \
+DEFINE_EVENT_CONDITION(tipc_sk_class, name, \
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \
+ const char *header), \
+ TP_ARGS(sk, skb, dqueues, header), \
+ TP_CONDITION(tipc_sk_filtering(sk) && (cond)))
+DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb));
+DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb));
+
+DECLARE_EVENT_CLASS(tipc_link_class,
+
+ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header),
+
+ TP_ARGS(l, dqueues, header),
+
+ TP_STRUCT__entry(
+ __string(header, header)
+ __array(char, name, TIPC_MAX_LINK_NAME)
+ __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN)
+ ),
+
+ TP_fast_assign(
+ __assign_str(header, header);
+ tipc_link_name_ext(l, __entry->name);
+ tipc_link_dump(l, dqueues, __get_str(buf));
+ ),
+
+ TP_printk("<%s> %s\n%s", __entry->name, __get_str(header),
+ __get_str(buf))
+);
+
+#define DEFINE_LINK_EVENT(name) \
+DEFINE_EVENT(tipc_link_class, name, \
+ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \
+ TP_ARGS(l, dqueues, header))
+DEFINE_LINK_EVENT(tipc_link_dump);
+DEFINE_LINK_EVENT(tipc_link_conges);
+DEFINE_LINK_EVENT(tipc_link_timeout);
+DEFINE_LINK_EVENT(tipc_link_reset);
+
+#define DEFINE_LINK_EVENT_COND(name, cond) \
+DEFINE_EVENT_CONDITION(tipc_link_class, name, \
+ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \
+ TP_ARGS(l, dqueues, header), \
+ TP_CONDITION(cond))
+DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l));
+
+DECLARE_EVENT_CLASS(tipc_link_transmq_class,
+
+ TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),
+
+ TP_ARGS(r, f, t, tq),
+
+ TP_STRUCT__entry(
+ __array(char, name, TIPC_MAX_LINK_NAME)
+ __field(u16, from)
+ __field(u16, to)
+ __field(u32, len)
+ __field(u16, fseqno)
+ __field(u16, lseqno)
+ ),
+
+ TP_fast_assign(
+ tipc_link_name_ext(r, __entry->name);
+ __entry->from = f;
+ __entry->to = t;
+ __entry->len = skb_queue_len(tq);
+ __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq)));
+ __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq)));
+ ),
+
+ TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n",
+ __entry->name, __entry->from, __entry->to,
+ __entry->len, __entry->fseqno, __entry->lseqno)
+);
+
+DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans,
+ TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),
+ TP_ARGS(r, f, t, tq)
+);
+
+DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack,
+ TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),
+ TP_ARGS(r, f, t, tq),
+ TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n",
+ __entry->name, __entry->from, __entry->to,
+ __entry->len, __entry->fseqno, __entry->lseqno)
+);
+
+DECLARE_EVENT_CLASS(tipc_node_class,
+
+ TP_PROTO(struct tipc_node *n, bool more, const char *header),
+
+ TP_ARGS(n, more, header),
+
+ TP_STRUCT__entry(
+ __string(header, header)
+ __field(u32, addr)
+ __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN)
+ ),
+
+ TP_fast_assign(
+ __assign_str(header, header);
+ __entry->addr = tipc_node_get_addr(n);
+ tipc_node_dump(n, more, __get_str(buf));
+ ),
+
+ TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header),
+ __get_str(buf))
+);
+
+#define DEFINE_NODE_EVENT(name) \
+DEFINE_EVENT(tipc_node_class, name, \
+ TP_PROTO(struct tipc_node *n, bool more, const char *header), \
+ TP_ARGS(n, more, header))
+DEFINE_NODE_EVENT(tipc_node_dump);
+DEFINE_NODE_EVENT(tipc_node_create);
+DEFINE_NODE_EVENT(tipc_node_delete);
+DEFINE_NODE_EVENT(tipc_node_lost_contact);
+DEFINE_NODE_EVENT(tipc_node_timeout);
+DEFINE_NODE_EVENT(tipc_node_link_up);
+DEFINE_NODE_EVENT(tipc_node_link_down);
+DEFINE_NODE_EVENT(tipc_node_reset_links);
+DEFINE_NODE_EVENT(tipc_node_check_state);
+
+DECLARE_EVENT_CLASS(tipc_fsm_class,
+
+ TP_PROTO(const char *name, u32 os, u32 ns, int evt),
+
+ TP_ARGS(name, os, ns, evt),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(u32, os)
+ __field(u32, ns)
+ __field(u32, evt)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->os = os;
+ __entry->ns = ns;
+ __entry->evt = evt;
+ ),
+
+ TP_printk("<%s> %s--(%s)->%s\n", __get_str(name),
+ state_sym(__entry->os), evt_sym(__entry->evt),
+ state_sym(__entry->ns))
+);
+
+#define DEFINE_FSM_EVENT(fsm_name) \
+DEFINE_EVENT(tipc_fsm_class, fsm_name, \
+ TP_PROTO(const char *name, u32 os, u32 ns, int evt), \
+ TP_ARGS(name, os, ns, evt))
+DEFINE_FSM_EVENT(tipc_link_fsm);
+DEFINE_FSM_EVENT(tipc_node_fsm);
+
+TRACE_EVENT(tipc_l2_device_event,
+
+ TP_PROTO(struct net_device *dev, struct tipc_bearer *b,
+ unsigned long evt),
+
+ TP_ARGS(dev, b, evt),
+
+ TP_STRUCT__entry(
+ __string(dev_name, dev->name)
+ __string(b_name, b->name)
+ __field(unsigned long, evt)
+ __field(u8, b_up)
+ __field(u8, carrier)
+ __field(u8, oper)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev_name, dev->name);
+ __assign_str(b_name, b->name);
+ __entry->evt = evt;
+ __entry->b_up = test_bit(0, &b->up);
+ __entry->carrier = netif_carrier_ok(dev);
+ __entry->oper = netif_oper_up(dev);
+ ),
+
+ TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n",
+ dev_evt_sym(__entry->evt), __get_str(dev_name),
+ __get_str(b_name), (__entry->oper) ? "up" : "down",
+ (__entry->carrier) ? "ok" : "notok",
+ (__entry->b_up) ? "up" : "down")
+);
+
+#endif /* _TIPC_TRACE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 10dc59ce9c82..4d85d71f16e2 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -245,10 +245,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
}
err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr);
- if (err) {
- kfree_skb(_skb);
+ if (err)
goto out;
- }
}
err = 0;
out:
@@ -681,6 +679,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
if (err)
goto err;
+ if (remote.proto != local.proto) {
+ err = -EINVAL;
+ goto err;
+ }
+
/* Checking remote ip address */
rmcast = tipc_udp_is_mcast_addr(&remote);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 311cec8e533d..78cb4a584080 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -55,8 +55,10 @@ enum {
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
+static struct proto *saved_tcpv4_prot;
+static DEFINE_MUTEX(tcpv4_prot_mutex);
static LIST_HEAD(device_list);
-static DEFINE_MUTEX(device_mutex);
+static DEFINE_SPINLOCK(device_spinlock);
static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
@@ -538,11 +540,14 @@ static struct tls_context *create_ctx(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
if (!ctx)
return NULL;
icsk->icsk_ulp_data = ctx;
+ ctx->setsockopt = sk->sk_prot->setsockopt;
+ ctx->getsockopt = sk->sk_prot->getsockopt;
+ ctx->sk_proto_close = sk->sk_prot->close;
return ctx;
}
@@ -552,7 +557,7 @@ static int tls_hw_prot(struct sock *sk)
struct tls_device *dev;
int rc = 0;
- mutex_lock(&device_mutex);
+ spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
if (dev->feature && dev->feature(dev)) {
ctx = create_ctx(sk);
@@ -570,7 +575,7 @@ static int tls_hw_prot(struct sock *sk)
}
}
out:
- mutex_unlock(&device_mutex);
+ spin_unlock_bh(&device_spinlock);
return rc;
}
@@ -579,12 +584,17 @@ static void tls_hw_unhash(struct sock *sk)
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_device *dev;
- mutex_lock(&device_mutex);
+ spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
- if (dev->unhash)
+ if (dev->unhash) {
+ kref_get(&dev->kref);
+ spin_unlock_bh(&device_spinlock);
dev->unhash(dev, sk);
+ kref_put(&dev->kref, dev->release);
+ spin_lock_bh(&device_spinlock);
+ }
}
- mutex_unlock(&device_mutex);
+ spin_unlock_bh(&device_spinlock);
ctx->unhash(sk);
}
@@ -595,12 +605,17 @@ static int tls_hw_hash(struct sock *sk)
int err;
err = ctx->hash(sk);
- mutex_lock(&device_mutex);
+ spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
- if (dev->hash)
+ if (dev->hash) {
+ kref_get(&dev->kref);
+ spin_unlock_bh(&device_spinlock);
err |= dev->hash(dev, sk);
+ kref_put(&dev->kref, dev->release);
+ spin_lock_bh(&device_spinlock);
+ }
}
- mutex_unlock(&device_mutex);
+ spin_unlock_bh(&device_spinlock);
if (err)
tls_hw_unhash(sk);
@@ -675,9 +690,6 @@ static int tls_init(struct sock *sk)
rc = -ENOMEM;
goto out;
}
- ctx->setsockopt = sk->sk_prot->setsockopt;
- ctx->getsockopt = sk->sk_prot->getsockopt;
- ctx->sk_proto_close = sk->sk_prot->close;
/* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
if (ip_ver == TLSV6 &&
@@ -690,6 +702,16 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
+ if (ip_ver == TLSV4 &&
+ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) {
+ mutex_lock(&tcpv4_prot_mutex);
+ if (likely(sk->sk_prot != saved_tcpv4_prot)) {
+ build_protos(tls_prots[TLSV4], sk->sk_prot);
+ smp_store_release(&saved_tcpv4_prot, sk->sk_prot);
+ }
+ mutex_unlock(&tcpv4_prot_mutex);
+ }
+
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
@@ -699,17 +721,17 @@ out:
void tls_register_device(struct tls_device *device)
{
- mutex_lock(&device_mutex);
+ spin_lock_bh(&device_spinlock);
list_add_tail(&device->dev_list, &device_list);
- mutex_unlock(&device_mutex);
+ spin_unlock_bh(&device_spinlock);
}
EXPORT_SYMBOL(tls_register_device);
void tls_unregister_device(struct tls_device *device)
{
- mutex_lock(&device_mutex);
+ spin_lock_bh(&device_spinlock);
list_del(&device->dev_list);
- mutex_unlock(&device_mutex);
+ spin_unlock_bh(&device_spinlock);
}
EXPORT_SYMBOL(tls_unregister_device);
@@ -721,8 +743,6 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
static int __init tls_register(void)
{
- build_protos(tls_prots[TLSV4], &tcp_prot);
-
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7b1af8b59cd2..11cdc8f7db63 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -686,16 +686,24 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
struct sk_psock *psock;
struct sock *sk_redir;
struct tls_rec *rec;
+ bool enospc, policy;
int err = 0, send;
- bool enospc;
+ u32 delta = 0;
+ policy = !(flags & MSG_SENDPAGE_NOPOLICY);
psock = sk_psock_get(sk);
- if (!psock)
+ if (!psock || !policy)
return tls_push_record(sk, flags, record_type);
more_data:
enospc = sk_msg_full(msg);
- if (psock->eval == __SK_NONE)
+ if (psock->eval == __SK_NONE) {
+ delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (delta < msg->sg.size)
+ delta -= msg->sg.size;
+ else
+ delta = 0;
+ }
if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
!enospc && !full_record) {
err = -ENOSPC;
@@ -743,7 +751,7 @@ more_data:
msg->apply_bytes -= send;
if (msg->sg.size == 0)
tls_free_open_rec(sk);
- *copied -= send;
+ *copied -= (send + delta);
err = -EACCES;
}
@@ -935,10 +943,12 @@ fallback_to_reg_send:
tls_ctx->tx.overhead_size);
}
- ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl,
- try_to_copy);
- if (ret < 0)
- goto trim_sgl;
+ if (try_to_copy) {
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter,
+ msg_pl, try_to_copy);
+ if (ret < 0)
+ goto trim_sgl;
+ }
/* Open records defined only if successfully copied, otherwise
* we would trim the sg but not reset the open record frags.
@@ -1010,8 +1020,8 @@ send_end:
return copied ? copied : ret;
}
-int tls_sw_sendpage(struct sock *sk, struct page *page,
- int offset, size_t size, int flags)
+int tls_sw_do_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
{
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -1026,15 +1036,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
int ret = 0;
bool eor;
- if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
- MSG_SENDPAGE_NOTLAST))
- return -ENOTSUPP;
-
- /* No MSG_EOR from splice, only look at MSG_MORE */
eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST));
-
- lock_sock(sk);
-
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/* Wait till there is any pending write on socket */
@@ -1138,10 +1140,34 @@ wait_for_memory:
}
sendpage_end:
ret = sk_stream_error(sk, flags, ret);
- release_sock(sk);
return copied ? copied : ret;
}
+int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
+ return -ENOTSUPP;
+
+ return tls_sw_do_sendpage(sk, page, offset, size, flags);
+}
+
+int tls_sw_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ int ret;
+
+ if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
+ return -ENOTSUPP;
+
+ lock_sock(sk);
+ ret = tls_sw_do_sendpage(sk, page, offset, size, flags);
+ release_sock(sk);
+ return ret;
+}
+
static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
int flags, long timeo, int *err)
{
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ab27a2872935..43a1dec08825 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -107,6 +107,7 @@
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/poll.h>
+#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
@@ -504,9 +505,13 @@ out:
static int __vsock_bind_stream(struct vsock_sock *vsk,
struct sockaddr_vm *addr)
{
- static u32 port = LAST_RESERVED_PORT + 1;
+ static u32 port = 0;
struct sockaddr_vm new_addr;
+ if (!port)
+ port = LAST_RESERVED_PORT + 1 +
+ prandom_u32_max(U32_MAX - LAST_RESERVED_PORT);
+
vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
if (addr->svm_port == VMADDR_PORT_ANY) {
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index cb332adb84cd..c361ce782412 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -264,6 +264,31 @@ vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src,
}
static int
+vmci_transport_alloc_send_control_pkt(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ enum vmci_transport_packet_type type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ u16 proto,
+ struct vmci_handle handle)
+{
+ struct vmci_transport_packet *pkt;
+ int err;
+
+ pkt = kmalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return -ENOMEM;
+
+ err = __vmci_transport_send_control_pkt(pkt, src, dst, type, size,
+ mode, wait, proto, handle,
+ true);
+ kfree(pkt);
+
+ return err;
+}
+
+static int
vmci_transport_send_control_pkt(struct sock *sk,
enum vmci_transport_packet_type type,
u64 size,
@@ -272,9 +297,7 @@ vmci_transport_send_control_pkt(struct sock *sk,
u16 proto,
struct vmci_handle handle)
{
- struct vmci_transport_packet *pkt;
struct vsock_sock *vsk;
- int err;
vsk = vsock_sk(sk);
@@ -284,17 +307,10 @@ vmci_transport_send_control_pkt(struct sock *sk,
if (!vsock_addr_bound(&vsk->remote_addr))
return -EINVAL;
- pkt = kmalloc(sizeof(*pkt), GFP_KERNEL);
- if (!pkt)
- return -ENOMEM;
-
- err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr,
- &vsk->remote_addr, type, size,
- mode, wait, proto, handle,
- true);
- kfree(pkt);
-
- return err;
+ return vmci_transport_alloc_send_control_pkt(&vsk->local_addr,
+ &vsk->remote_addr,
+ type, size, mode,
+ wait, proto, handle);
}
static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
@@ -312,12 +328,29 @@ static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
static int vmci_transport_send_reset(struct sock *sk,
struct vmci_transport_packet *pkt)
{
+ struct sockaddr_vm *dst_ptr;
+ struct sockaddr_vm dst;
+ struct vsock_sock *vsk;
+
if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST)
return 0;
- return vmci_transport_send_control_pkt(sk,
- VMCI_TRANSPORT_PACKET_TYPE_RST,
- 0, 0, NULL, VSOCK_PROTO_INVALID,
- VMCI_INVALID_HANDLE);
+
+ vsk = vsock_sk(sk);
+
+ if (!vsock_addr_bound(&vsk->local_addr))
+ return -EINVAL;
+
+ if (vsock_addr_bound(&vsk->remote_addr)) {
+ dst_ptr = &vsk->remote_addr;
+ } else {
+ vsock_addr_init(&dst, pkt->dg.src.context,
+ pkt->src_port);
+ dst_ptr = &dst;
+ }
+ return vmci_transport_alloc_send_control_pkt(&vsk->local_addr, dst_ptr,
+ VMCI_TRANSPORT_PACKET_TYPE_RST,
+ 0, 0, NULL, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
}
static int vmci_transport_send_negotiate(struct sock *sk, size_t size)
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 1d84f91bbfb0..72a224ce8e0a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-y += pmsr.o
cfg80211-$(CONFIG_OF) += of.o
cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 2db713d18f71..7dc1bbd0888f 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -6,6 +6,7 @@
*
* Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2018 Intel Corporation
*/
#include <linux/export.h>
@@ -747,6 +748,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
case NL80211_CHAN_WIDTH_20:
if (!ht_cap->ht_supported)
return false;
+ /* fall through */
case NL80211_CHAN_WIDTH_20_NOHT:
prohibited_flags |= IEEE80211_CHAN_NO_20MHZ;
width = 20;
@@ -769,6 +771,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
return false;
+ /* fall through */
case NL80211_CHAN_WIDTH_80:
if (!vht_cap->vht_supported)
return false;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5bd01058b9e6..623dfe5e211c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -4,6 +4,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -190,11 +191,25 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
return err;
}
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ if (!wdev->netdev)
+ continue;
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
+ }
+ nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY);
+
wiphy_net_set(&rdev->wiphy, net);
err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
WARN_ON(err);
+ nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ if (!wdev->netdev)
+ continue;
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
+ }
+
return 0;
}
@@ -664,6 +679,34 @@ int wiphy_register(struct wiphy *wiphy)
return -EINVAL;
#endif
+ if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported))
+ return -EINVAL;
+
+ if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) {
+ if (WARN_ON(!wiphy->pmsr_capa->ftm.asap &&
+ !wiphy->pmsr_capa->ftm.non_asap))
+ return -EINVAL;
+ if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles ||
+ !wiphy->pmsr_capa->ftm.bandwidths))
+ return -EINVAL;
+ if (WARN_ON(wiphy->pmsr_capa->ftm.preambles &
+ ~(BIT(NL80211_PREAMBLE_LEGACY) |
+ BIT(NL80211_PREAMBLE_HT) |
+ BIT(NL80211_PREAMBLE_VHT) |
+ BIT(NL80211_PREAMBLE_DMG))))
+ return -EINVAL;
+ if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths &
+ ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) |
+ BIT(NL80211_CHAN_WIDTH_20) |
+ BIT(NL80211_CHAN_WIDTH_40) |
+ BIT(NL80211_CHAN_WIDTH_80) |
+ BIT(NL80211_CHAN_WIDTH_80P80) |
+ BIT(NL80211_CHAN_WIDTH_160) |
+ BIT(NL80211_CHAN_WIDTH_5) |
+ BIT(NL80211_CHAN_WIDTH_10))))
+ return -EINVAL;
+ }
+
/*
* if a wiphy has unsupported modes for regulatory channel enforcement,
* opt-out of enforcement checking
@@ -1087,6 +1130,8 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
ASSERT_RTNL();
ASSERT_WDEV_LOCK(wdev);
+ cfg80211_pmsr_wdev_down(wdev);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
__cfg80211_leave_ibss(rdev, dev, true);
@@ -1174,6 +1219,9 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
spin_lock_init(&wdev->event_lock);
INIT_LIST_HEAD(&wdev->mgmt_registrations);
spin_lock_init(&wdev->mgmt_registrations_lock);
+ INIT_LIST_HEAD(&wdev->pmsr_list);
+ spin_lock_init(&wdev->pmsr_lock);
+ INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
/*
* We get here also when the interface changes network namespaces,
diff --git a/net/wireless/core.h b/net/wireless/core.h
index c61dbba8bf47..c5d6f3418601 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -3,6 +3,7 @@
* Wireless configuration interface internals.
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2018 Intel Corporation
*/
#ifndef __NET_WIRELESS_CORE_H
#define __NET_WIRELESS_CORE_H
@@ -530,4 +531,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
void cfg80211_cqm_config_free(struct wireless_dev *wdev);
+void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid);
+void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev);
+void cfg80211_pmsr_free_wk(struct work_struct *work);
+
#endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index 6beab0cfcb99..55214fe925b2 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -75,7 +75,7 @@ static void *lib80211_ccmp_init(int key_idx)
goto fail;
priv->key_idx = key_idx;
- priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ priv->tfm = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(priv->tfm)) {
priv->tfm = NULL;
goto fail;
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index b5e235573c8a..35f06563207d 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -99,7 +99,7 @@ static void *lib80211_tkip_init(int key_idx)
priv->key_idx = key_idx;
- priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(priv->tx_tfm_arc4)) {
priv->tx_tfm_arc4 = NULL;
goto fail;
@@ -111,7 +111,7 @@ static void *lib80211_tkip_init(int key_idx)
goto fail;
}
- priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(priv->rx_tfm_arc4)) {
priv->rx_tfm_arc4 = NULL;
goto fail;
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 6015f6b542a6..20c1ad63ad44 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
goto fail;
priv->key_idx = keyidx;
- priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ priv->tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(priv->tx_tfm)) {
priv->tx_tfm = NULL;
goto fail;
}
- priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ priv->rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
if (IS_ERR(priv->rx_tfm)) {
priv->rx_tfm = NULL;
goto fail;
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 12b3edf70a7b..1615e503f8e3 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -272,11 +272,11 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
p1 = (u8*)(ht_capa);
p2 = (u8*)(ht_capa_mask);
- for (i = 0; i<sizeof(*ht_capa); i++)
+ for (i = 0; i < sizeof(*ht_capa); i++)
p1[i] &= p2[i];
}
-/* Do a logical ht_capa &= ht_capa_mask. */
+/* Do a logical vht_capa &= vht_capa_mask. */
void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
const struct ieee80211_vht_cap *vht_capa_mask)
{
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 744b5851bbf9..5e49492d5911 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -240,7 +240,63 @@ nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
.len = U8_MAX },
};
-static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
+static const struct nla_policy
+nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
+ [NL80211_PMSR_FTM_REQ_ATTR_ASAP] = { .type = NLA_FLAG },
+ [NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE] = { .type = NLA_U32 },
+ [NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP] =
+ NLA_POLICY_MAX(NLA_U8, 15),
+ [NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD] = { .type = NLA_U16 },
+ [NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] =
+ NLA_POLICY_MAX(NLA_U8, 15),
+ [NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] =
+ NLA_POLICY_MAX(NLA_U8, 15),
+ [NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 },
+ [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG },
+ [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_pmsr_req_data_policy[NL80211_PMSR_TYPE_MAX + 1] = {
+ [NL80211_PMSR_TYPE_FTM] =
+ NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX,
+ nl80211_pmsr_ftm_req_attr_policy),
+};
+
+static const struct nla_policy
+nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = {
+ [NL80211_PMSR_REQ_ATTR_DATA] =
+ NLA_POLICY_NESTED(NL80211_PMSR_TYPE_MAX,
+ nl80211_pmsr_req_data_policy),
+ [NL80211_PMSR_REQ_ATTR_GET_AP_TSF] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = {
+ [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR,
+ /*
+ * we could specify this again to be the top-level policy,
+ * but that would open us up to recursion problems ...
+ */
+ [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED },
+ [NL80211_PMSR_PEER_ATTR_REQ] =
+ NLA_POLICY_NESTED(NL80211_PMSR_REQ_ATTR_MAX,
+ nl80211_pmsr_req_attr_policy),
+ [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy
+nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = {
+ [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT },
+ [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT },
+ [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT },
+ [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT },
+ [NL80211_PMSR_ATTR_PEERS] =
+ NLA_POLICY_NESTED_ARRAY(NL80211_PMSR_PEER_ATTR_MAX,
+ nl80211_psmr_peer_attr_policy),
+};
+
+const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
.len = 20-1 },
@@ -497,6 +553,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
.type = NLA_NESTED,
.validation_data = nl80211_ftm_responder_policy,
},
+ [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NL80211_ATTR_PEER_MEASUREMENTS] =
+ NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX,
+ nl80211_pmsr_attr_policy),
};
/* policy for the key attributes */
@@ -637,9 +697,9 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
[NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
};
-static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
- struct cfg80211_registered_device **rdev,
- struct wireless_dev **wdev)
+int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
+ struct cfg80211_registered_device **rdev,
+ struct wireless_dev **wdev)
{
int err;
@@ -684,8 +744,8 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
}
/* message building helper */
-static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
- int flags, u8 cmd)
+void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd)
{
/* since there is no private header just add the generic one */
return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
@@ -1615,6 +1675,91 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
return -ENOBUFS;
}
+static int
+nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
+ struct sk_buff *msg)
+{
+ struct nlattr *ftm;
+
+ if (!cap->ftm.supported)
+ return 0;
+
+ ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM);
+ if (!ftm)
+ return -ENOBUFS;
+
+ if (cap->ftm.asap && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_ASAP))
+ return -ENOBUFS;
+ if (cap->ftm.non_asap &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP))
+ return -ENOBUFS;
+ if (cap->ftm.request_lci &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI))
+ return -ENOBUFS;
+ if (cap->ftm.request_civicloc &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES,
+ cap->ftm.preambles))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS,
+ cap->ftm.bandwidths))
+ return -ENOBUFS;
+ if (cap->ftm.max_bursts_exponent >= 0 &&
+ nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT,
+ cap->ftm.max_bursts_exponent))
+ return -ENOBUFS;
+ if (cap->ftm.max_ftms_per_burst &&
+ nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
+ cap->ftm.max_ftms_per_burst))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, ftm);
+ return 0;
+}
+
+static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ const struct cfg80211_pmsr_capabilities *cap = rdev->wiphy.pmsr_capa;
+ struct nlattr *pmsr, *caps;
+
+ if (!cap)
+ return 0;
+
+ /*
+ * we don't need to clean up anything here since the caller
+ * will genlmsg_cancel() if we fail
+ */
+
+ pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS);
+ if (!pmsr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_PMSR_ATTR_MAX_PEERS, cap->max_peers))
+ return -ENOBUFS;
+
+ if (cap->report_ap_tsf &&
+ nla_put_flag(msg, NL80211_PMSR_ATTR_REPORT_AP_TSF))
+ return -ENOBUFS;
+
+ if (cap->randomize_mac_addr &&
+ nla_put_flag(msg, NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR))
+ return -ENOBUFS;
+
+ caps = nla_nest_start(msg, NL80211_PMSR_ATTR_TYPE_CAPA);
+ if (!caps)
+ return -ENOBUFS;
+
+ if (nl80211_send_pmsr_ftm_capa(cap, msg))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, caps);
+ nla_nest_end(msg, pmsr);
+
+ return 0;
+}
+
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -1706,6 +1851,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 1:
if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES,
sizeof(u32) * rdev->wiphy.n_cipher_suites,
@@ -1752,6 +1898,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 2:
if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES,
rdev->wiphy.interface_modes))
@@ -1759,6 +1906,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 3:
nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS);
if (!nl_bands)
@@ -1784,6 +1932,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->chan_start++;
if (state->split)
break;
+ /* fall through */
default:
/* add frequencies */
nl_freqs = nla_nest_start(
@@ -1837,6 +1986,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 4:
nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS);
if (!nl_cmds)
@@ -1863,6 +2013,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 5:
if (rdev->ops->remain_on_channel &&
(rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) &&
@@ -1880,6 +2031,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 6:
#ifdef CONFIG_PM
if (nl80211_send_wowlan(msg, rdev, state->split))
@@ -1890,6 +2042,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
#else
state->split_start++;
#endif
+ /* fall through */
case 7:
if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES,
rdev->wiphy.software_iftypes))
@@ -1902,6 +2055,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
state->split_start++;
if (state->split)
break;
+ /* fall through */
case 8:
if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) &&
nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME,
@@ -2118,6 +2272,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
goto nla_put_failure;
}
+ state->split_start++;
+ break;
+ case 14:
+ if (nl80211_send_pmsr_capa(rdev, msg))
+ goto nla_put_failure;
+
/* done */
state->split_start = 0;
break;
@@ -2318,9 +2478,9 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
wdev->iftype == NL80211_IFTYPE_P2P_GO;
}
-static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
- struct genl_info *info,
- struct cfg80211_chan_def *chandef)
+int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
+ struct cfg80211_chan_def *chandef)
{
struct netlink_ext_ack *extack = info->extack;
struct nlattr **attrs = info->attrs;
@@ -2794,12 +2954,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static inline u64 wdev_id(struct wireless_dev *wdev)
-{
- return (u64)wdev->identifier |
- ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
-}
-
static int nl80211_send_chandef(struct sk_buff *msg,
const struct cfg80211_chan_def *chandef)
{
@@ -2832,14 +2986,15 @@ static int nl80211_send_chandef(struct sk_buff *msg,
static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev, bool removal)
+ struct wireless_dev *wdev,
+ enum nl80211_commands cmd)
{
struct net_device *dev = wdev->netdev;
- u8 cmd = NL80211_CMD_NEW_INTERFACE;
void *hdr;
- if (removal)
- cmd = NL80211_CMD_DEL_INTERFACE;
+ WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE &&
+ cmd != NL80211_CMD_DEL_INTERFACE &&
+ cmd != NL80211_CMD_SET_INTERFACE);
hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
@@ -2987,7 +3142,8 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
}
if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- rdev, wdev, false) < 0) {
+ rdev, wdev,
+ NL80211_CMD_NEW_INTERFACE) < 0) {
goto out;
}
if_idx++;
@@ -3017,7 +3173,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
return -ENOMEM;
if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
- rdev, wdev, false) < 0) {
+ rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) {
nlmsg_free(msg);
return -ENOBUFS;
}
@@ -3207,6 +3363,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
if (!err && params.use_4addr != -1)
dev->ieee80211_ptr->use_4addr = params.use_4addr;
+ if (change && !err) {
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE);
+ }
+
return err;
}
@@ -3298,7 +3460,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
}
if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
- rdev, wdev, false) < 0) {
+ rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) {
nlmsg_free(msg);
return -ENOBUFS;
}
@@ -4521,8 +4683,7 @@ static int parse_station_flags(struct genl_info *info,
return 0;
}
-static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
- int attr)
+bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
{
struct nlattr *rate;
u32 bitrate;
@@ -4731,6 +4892,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(LOCAL_PM, local_pm, u32);
PUT_SINFO(PEER_PM, peer_pm, u32);
PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
+ PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8);
if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
@@ -6122,7 +6284,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW,
cur_params.dot11MeshAwakeWindowDuration) ||
nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT,
- cur_params.plink_timeout))
+ cur_params.plink_timeout) ||
+ nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE,
+ cur_params.dot11MeshConnectedToMeshGate))
goto nla_put_failure;
nla_nest_end(msg, pinfoattr);
genlmsg_end(msg, hdr);
@@ -6179,6 +6343,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
NL80211_MESH_POWER_MAX),
[NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
[NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
};
static const struct nla_policy
@@ -6290,6 +6455,9 @@ do { \
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask,
NL80211_MESHCONF_RSSI_THRESHOLD,
nla_get_s32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask,
+ NL80211_MESHCONF_CONNECTED_TO_GATE,
+ nla_get_u8);
/*
* Check HT operation mode based on
* IEEE 802.11-2016 9.4.2.57 HT Operation element.
@@ -6855,8 +7023,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
return 0;
}
-static int nl80211_parse_random_mac(struct nlattr **attrs,
- u8 *mac_addr, u8 *mac_addr_mask)
+int nl80211_parse_random_mac(struct nlattr **attrs,
+ u8 *mac_addr, u8 *mac_addr_mask)
{
int i;
@@ -7822,6 +7990,60 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
return err;
}
+static int nl80211_notify_radar_detection(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_chan_def chandef;
+ enum nl80211_dfs_regions dfs_region;
+ int err;
+
+ dfs_region = reg_get_dfs_region(wiphy);
+ if (dfs_region == NL80211_DFS_UNSET) {
+ GENL_SET_ERR_MSG(info,
+ "DFS Region is not set. Unexpected Radar indication");
+ return -EINVAL;
+ }
+
+ err = nl80211_parse_chandef(rdev, info, &chandef);
+ if (err) {
+ GENL_SET_ERR_MSG(info, "Unable to extract chandef info");
+ return err;
+ }
+
+ err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "chandef is invalid");
+ return err;
+ }
+
+ if (err == 0) {
+ GENL_SET_ERR_MSG(info,
+ "Unexpected Radar indication for chandef/iftype");
+ return -EINVAL;
+ }
+
+ /* Do not process this notification if radar is already detected
+ * by kernel on this channel, and return success.
+ */
+ if (chandef.chan->dfs_state == NL80211_DFS_UNAVAILABLE)
+ return 0;
+
+ cfg80211_set_dfs_state(wiphy, &chandef, NL80211_DFS_UNAVAILABLE);
+
+ cfg80211_sched_dfs_chan_update(rdev);
+
+ memcpy(&rdev->radar_chandef, &chandef, sizeof(chandef));
+
+ /* Propagate this notification to other radios as well */
+ queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk);
+
+ return 0;
+}
+
static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -7870,6 +8092,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
}
memset(&params, 0, sizeof(params));
+ params.beacon_csa.ftm_responder = -1;
if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
!info->attrs[NL80211_ATTR_CH_SWITCH_COUNT])
@@ -8929,8 +9152,10 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
int r = validate_pae_over_nl80211(rdev, info);
- if (r < 0)
+ if (r < 0) {
+ kzfree(connkeys);
return r;
+ }
ibss.control_port_over_nl80211 = true;
}
@@ -13898,6 +14123,22 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_PEER_MEASUREMENT_START,
+ .doit = nl80211_pmsr_start,
+ .policy = nl80211_policy,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_NOTIFY_RADAR,
+ .doit = nl80211_notify_radar_detection,
+ .policy = nl80211_policy,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
};
static struct genl_family nl80211_fam __ro_after_init = {
@@ -13945,15 +14186,11 @@ void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
{
struct sk_buff *msg;
- WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE &&
- cmd != NL80211_CMD_DEL_INTERFACE);
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev,
- cmd == NL80211_CMD_DEL_INTERFACE) < 0) {
+ if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, cmd) < 0) {
nlmsg_free(msg);
return;
}
@@ -14572,7 +14809,8 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
}
void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
- const u8* ie, u8 ie_len, gfp_t gfp)
+ const u8 *ie, u8 ie_len,
+ int sig_dbm, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -14598,7 +14836,9 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
(ie_len && ie &&
- nla_put(msg, NL80211_ATTR_IE, ie_len , ie)))
+ nla_put(msg, NL80211_ATTR_IE, ie_len, ie)) ||
+ (sig_dbm &&
+ nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -15881,6 +16121,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
} else if (wdev->conn_owner_nlportid == notify->portid) {
schedule_work(&wdev->disconnect_wk);
}
+
+ cfg80211_release_pmsr(wdev, notify->portid);
}
spin_lock_bh(&rdev->beacon_registrations_lock);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 79e47fe60c35..531c82dcba6b 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -1,4 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions of this file
+ * Copyright (C) 2018 Intel Corporation
+ */
#ifndef __NET_WIRELESS_NL80211_H
#define __NET_WIRELESS_NL80211_H
@@ -6,6 +10,30 @@
int nl80211_init(void);
void nl80211_exit(void);
+
+extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
+
+void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd);
+bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
+ int attr);
+
+static inline u64 wdev_id(struct wireless_dev *wdev)
+{
+ return (u64)wdev->identifier |
+ ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
+}
+
+int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
+ struct cfg80211_registered_device **rdev,
+ struct wireless_dev **wdev);
+
+int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
+ struct cfg80211_chan_def *chandef);
+int nl80211_parse_random_mac(struct nlattr **attrs,
+ u8 *mac_addr, u8 *mac_addr_mask);
+
void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
enum nl80211_commands cmd);
void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
@@ -95,4 +123,8 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev);
void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);
+/* peer measurement */
+int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info);
+int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb);
+
#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
new file mode 100644
index 000000000000..de9286703280
--- /dev/null
+++ b/net/wireless/pmsr.c
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Intel Corporation
+ */
+#ifndef __PMSR_H
+#define __PMSR_H
+#include <net/cfg80211.h>
+#include "core.h"
+#include "nl80211.h"
+#include "rdev-ops.h"
+
+static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
+ struct nlattr *ftmreq,
+ struct cfg80211_pmsr_request_peer *out,
+ struct genl_info *info)
+{
+ const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa;
+ struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1];
+ u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */
+
+ /* validate existing data */
+ if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) {
+ NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth");
+ return -EINVAL;
+ }
+
+ /* no validation needed - was already done via nested policy */
+ nla_parse_nested(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL);
+
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE])
+ preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]);
+
+ /* set up values - struct is 0-initialized */
+ out->ftm.requested = true;
+
+ switch (out->chandef.chan->band) {
+ case NL80211_BAND_60GHZ:
+ /* optional */
+ break;
+ default:
+ if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) {
+ NL_SET_ERR_MSG(info->extack,
+ "FTM: must specify preamble");
+ return -EINVAL;
+ }
+ }
+
+ if (!(capa->ftm.preambles & BIT(preamble))) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE],
+ "FTM: invalid preamble");
+ return -EINVAL;
+ }
+
+ out->ftm.preamble = preamble;
+
+ out->ftm.burst_period = 0;
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD])
+ out->ftm.burst_period =
+ nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]);
+
+ out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP];
+ if (out->ftm.asap && !capa->ftm.asap) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP],
+ "FTM: ASAP mode not supported");
+ return -EINVAL;
+ }
+
+ if (!out->ftm.asap && !capa->ftm.non_asap) {
+ NL_SET_ERR_MSG(info->extack,
+ "FTM: non-ASAP mode not supported");
+ return -EINVAL;
+ }
+
+ out->ftm.num_bursts_exp = 0;
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP])
+ out->ftm.num_bursts_exp =
+ nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]);
+
+ if (capa->ftm.max_bursts_exponent >= 0 &&
+ out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP],
+ "FTM: max NUM_BURSTS_EXP must be set lower than the device limit");
+ return -EINVAL;
+ }
+
+ out->ftm.burst_duration = 15;
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
+ out->ftm.burst_duration =
+ nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
+
+ out->ftm.ftms_per_burst = 0;
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
+ out->ftm.ftms_per_burst =
+ nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
+
+ if (capa->ftm.max_ftms_per_burst &&
+ (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst ||
+ out->ftm.ftms_per_burst == 0)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST],
+ "FTM: FTMs per burst must be set lower than the device limit but non-zero");
+ return -EINVAL;
+ }
+
+ out->ftm.ftmr_retries = 3;
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES])
+ out->ftm.ftmr_retries =
+ nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]);
+
+ out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI];
+ if (out->ftm.request_lci && !capa->ftm.request_lci) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI],
+ "FTM: LCI request not supported");
+ }
+
+ out->ftm.request_civicloc =
+ !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC];
+ if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC],
+ "FTM: civic location request not supported");
+ }
+
+ return 0;
+}
+
+static int pmsr_parse_peer(struct cfg80211_registered_device *rdev,
+ struct nlattr *peer,
+ struct cfg80211_pmsr_request_peer *out,
+ struct genl_info *info)
+{
+ struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1];
+ struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1];
+ struct nlattr *treq;
+ int err, rem;
+
+ /* no validation needed - was already done via nested policy */
+ nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL);
+
+ if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] ||
+ !tb[NL80211_PMSR_PEER_ATTR_CHAN] ||
+ !tb[NL80211_PMSR_PEER_ATTR_REQ]) {
+ NL_SET_ERR_MSG_ATTR(info->extack, peer,
+ "insufficient peer data");
+ return -EINVAL;
+ }
+
+ memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN);
+
+ /* reuse info->attrs */
+ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1));
+ /* need to validate here, we don't want to have validation recursion */
+ err = nla_parse_nested(info->attrs, NL80211_ATTR_MAX,
+ tb[NL80211_PMSR_PEER_ATTR_CHAN],
+ nl80211_policy, info->extack);
+ if (err)
+ return err;
+
+ err = nl80211_parse_chandef(rdev, info, &out->chandef);
+ if (err)
+ return err;
+
+ /* no validation needed - was already done via nested policy */
+ nla_parse_nested(req, NL80211_PMSR_REQ_ATTR_MAX,
+ tb[NL80211_PMSR_PEER_ATTR_REQ],
+ NULL, NULL);
+
+ if (!req[NL80211_PMSR_REQ_ATTR_DATA]) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_PEER_ATTR_REQ],
+ "missing request type/data");
+ return -EINVAL;
+ }
+
+ if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF])
+ out->report_ap_tsf = true;
+
+ if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF],
+ "reporting AP TSF is not supported");
+ return -EINVAL;
+ }
+
+ nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) {
+ switch (nla_type(treq)) {
+ case NL80211_PMSR_TYPE_FTM:
+ err = pmsr_parse_ftm(rdev, treq, out, info);
+ break;
+ default:
+ NL_SET_ERR_MSG_ATTR(info->extack, treq,
+ "unsupported measurement type");
+ err = -EINVAL;
+ }
+ }
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *reqattr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS];
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_pmsr_request *req;
+ struct nlattr *peers, *peer;
+ int count, rem, err, idx;
+
+ if (!rdev->wiphy.pmsr_capa)
+ return -EOPNOTSUPP;
+
+ if (!reqattr)
+ return -EINVAL;
+
+ peers = nla_find(nla_data(reqattr), nla_len(reqattr),
+ NL80211_PMSR_ATTR_PEERS);
+ if (!peers)
+ return -EINVAL;
+
+ count = 0;
+ nla_for_each_nested(peer, peers, rem) {
+ count++;
+
+ if (count > rdev->wiphy.pmsr_capa->max_peers) {
+ NL_SET_ERR_MSG_ATTR(info->extack, peer,
+ "Too many peers used");
+ return -EINVAL;
+ }
+ }
+
+ req = kzalloc(struct_size(req, peers, count), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ if (info->attrs[NL80211_ATTR_TIMEOUT])
+ req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]);
+
+ if (info->attrs[NL80211_ATTR_MAC]) {
+ if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ info->attrs[NL80211_ATTR_MAC],
+ "device cannot randomize MAC address");
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ err = nl80211_parse_random_mac(info->attrs, req->mac_addr,
+ req->mac_addr_mask);
+ if (err)
+ goto out_err;
+ } else {
+ memcpy(req->mac_addr, nla_data(info->attrs[NL80211_ATTR_MAC]),
+ ETH_ALEN);
+ memset(req->mac_addr_mask, 0xff, ETH_ALEN);
+ }
+
+ idx = 0;
+ nla_for_each_nested(peer, peers, rem) {
+ /* NB: this reuses info->attrs, but we no longer need it */
+ err = pmsr_parse_peer(rdev, peer, &req->peers[idx], info);
+ if (err)
+ goto out_err;
+ idx++;
+ }
+
+ req->n_peers = count;
+ req->cookie = cfg80211_assign_cookie(rdev);
+
+ err = rdev_start_pmsr(rdev, wdev, req);
+ if (err)
+ goto out_err;
+
+ list_add_tail(&req->list, &wdev->pmsr_list);
+
+ nl_set_extack_cookie_u64(info->extack, req->cookie);
+ return 0;
+out_err:
+ kfree(req);
+ return err;
+}
+
+void cfg80211_pmsr_complete(struct wireless_dev *wdev,
+ struct cfg80211_pmsr_request *req,
+ gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ goto free_request;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0,
+ NL80211_CMD_PEER_MEASUREMENT_COMPLETE);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
+ goto free_msg;
+
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
+ NL80211_ATTR_PAD))
+ goto free_msg;
+
+ genlmsg_end(msg, hdr);
+ genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid);
+ goto free_request;
+free_msg:
+ nlmsg_free(msg);
+free_request:
+ spin_lock_bh(&wdev->pmsr_lock);
+ list_del(&req->list);
+ spin_unlock_bh(&wdev->pmsr_lock);
+ kfree(req);
+}
+EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete);
+
+static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg,
+ struct cfg80211_pmsr_result *res)
+{
+ if (res->status == NL80211_PMSR_STATUS_FAILURE) {
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON,
+ res->ftm.failure_reason))
+ goto error;
+
+ if (res->ftm.failure_reason ==
+ NL80211_PMSR_FTM_FAILURE_PEER_BUSY &&
+ res->ftm.busy_retry_time &&
+ nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME,
+ res->ftm.busy_retry_time))
+ goto error;
+
+ return 0;
+ }
+
+#define PUT(tp, attr, val) \
+ do { \
+ if (nla_put_##tp(msg, \
+ NL80211_PMSR_FTM_RESP_ATTR_##attr, \
+ res->ftm.val)) \
+ goto error; \
+ } while (0)
+
+#define PUTOPT(tp, attr, val) \
+ do { \
+ if (res->ftm.val##_valid) \
+ PUT(tp, attr, val); \
+ } while (0)
+
+#define PUT_U64(attr, val) \
+ do { \
+ if (nla_put_u64_64bit(msg, \
+ NL80211_PMSR_FTM_RESP_ATTR_##attr,\
+ res->ftm.val, \
+ NL80211_PMSR_FTM_RESP_ATTR_PAD)) \
+ goto error; \
+ } while (0)
+
+#define PUTOPT_U64(attr, val) \
+ do { \
+ if (res->ftm.val##_valid) \
+ PUT_U64(attr, val); \
+ } while (0)
+
+ if (res->ftm.burst_index >= 0)
+ PUT(u32, BURST_INDEX, burst_index);
+ PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts);
+ PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes);
+ PUT(u8, NUM_BURSTS_EXP, num_bursts_exp);
+ PUT(u8, BURST_DURATION, burst_duration);
+ PUT(u8, FTMS_PER_BURST, ftms_per_burst);
+ PUTOPT(s32, RSSI_AVG, rssi_avg);
+ PUTOPT(s32, RSSI_SPREAD, rssi_spread);
+ if (res->ftm.tx_rate_valid &&
+ !nl80211_put_sta_rate(msg, &res->ftm.tx_rate,
+ NL80211_PMSR_FTM_RESP_ATTR_TX_RATE))
+ goto error;
+ if (res->ftm.rx_rate_valid &&
+ !nl80211_put_sta_rate(msg, &res->ftm.rx_rate,
+ NL80211_PMSR_FTM_RESP_ATTR_RX_RATE))
+ goto error;
+ PUTOPT_U64(RTT_AVG, rtt_avg);
+ PUTOPT_U64(RTT_VARIANCE, rtt_variance);
+ PUTOPT_U64(RTT_SPREAD, rtt_spread);
+ PUTOPT_U64(DIST_AVG, dist_avg);
+ PUTOPT_U64(DIST_VARIANCE, dist_variance);
+ PUTOPT_U64(DIST_SPREAD, dist_spread);
+ if (res->ftm.lci && res->ftm.lci_len &&
+ nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI,
+ res->ftm.lci_len, res->ftm.lci))
+ goto error;
+ if (res->ftm.civicloc && res->ftm.civicloc_len &&
+ nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC,
+ res->ftm.civicloc_len, res->ftm.civicloc))
+ goto error;
+#undef PUT
+#undef PUTOPT
+#undef PUT_U64
+#undef PUTOPT_U64
+
+ return 0;
+error:
+ return -ENOSPC;
+}
+
+static int nl80211_pmsr_send_result(struct sk_buff *msg,
+ struct cfg80211_pmsr_result *res)
+{
+ struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata;
+
+ pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS);
+ if (!pmsr)
+ goto error;
+
+ peers = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS);
+ if (!peers)
+ goto error;
+
+ peer = nla_nest_start(msg, 1);
+ if (!peer)
+ goto error;
+
+ if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr))
+ goto error;
+
+ resp = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_RESP);
+ if (!resp)
+ goto error;
+
+ if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status) ||
+ nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME,
+ res->host_time, NL80211_PMSR_RESP_ATTR_PAD))
+ goto error;
+
+ if (res->ap_tsf_valid &&
+ nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF,
+ res->host_time, NL80211_PMSR_RESP_ATTR_PAD))
+ goto error;
+
+ if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL))
+ goto error;
+
+ data = nla_nest_start(msg, NL80211_PMSR_RESP_ATTR_DATA);
+ if (!data)
+ goto error;
+
+ typedata = nla_nest_start(msg, res->type);
+ if (!typedata)
+ goto error;
+
+ switch (res->type) {
+ case NL80211_PMSR_TYPE_FTM:
+ if (nl80211_pmsr_send_ftm_res(msg, res))
+ goto error;
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ nla_nest_end(msg, typedata);
+ nla_nest_end(msg, data);
+ nla_nest_end(msg, resp);
+ nla_nest_end(msg, peer);
+ nla_nest_end(msg, peers);
+ nla_nest_end(msg, pmsr);
+
+ return 0;
+error:
+ return -ENOSPC;
+}
+
+void cfg80211_pmsr_report(struct wireless_dev *wdev,
+ struct cfg80211_pmsr_request *req,
+ struct cfg80211_pmsr_result *result,
+ gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie,
+ result->addr);
+
+ /*
+ * Currently, only variable items are LCI and civic location,
+ * both of which are reasonably short so we don't need to
+ * worry about them here for the allocation.
+ */
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT);
+ if (!hdr)
+ goto free;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
+ goto free;
+
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
+ NL80211_ATTR_PAD))
+ goto free;
+
+ err = nl80211_pmsr_send_result(msg, result);
+ if (err) {
+ pr_err_ratelimited("peer measurement result: message didn't fit!");
+ goto free;
+ }
+
+ genlmsg_end(msg, hdr);
+ genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid);
+ return;
+free:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL_GPL(cfg80211_pmsr_report);
+
+void cfg80211_pmsr_free_wk(struct work_struct *work)
+{
+ struct wireless_dev *wdev = container_of(work, struct wireless_dev,
+ pmsr_free_wk);
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_pmsr_request *req, *tmp;
+ LIST_HEAD(free_list);
+
+ spin_lock_bh(&wdev->pmsr_lock);
+ list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) {
+ if (req->nl_portid)
+ continue;
+ list_move_tail(&req->list, &free_list);
+ }
+ spin_unlock_bh(&wdev->pmsr_lock);
+
+ list_for_each_entry_safe(req, tmp, &free_list, list) {
+ wdev_lock(wdev);
+ rdev_abort_pmsr(rdev, wdev, req);
+ wdev_unlock(wdev);
+
+ kfree(req);
+ }
+}
+
+void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev)
+{
+ struct cfg80211_pmsr_request *req;
+ bool found = false;
+
+ spin_lock_bh(&wdev->pmsr_lock);
+ list_for_each_entry(req, &wdev->pmsr_list, list) {
+ found = true;
+ req->nl_portid = 0;
+ }
+ spin_unlock_bh(&wdev->pmsr_lock);
+
+ if (found)
+ schedule_work(&wdev->pmsr_free_wk);
+ flush_work(&wdev->pmsr_free_wk);
+ WARN_ON(!list_empty(&wdev->pmsr_list));
+}
+
+void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid)
+{
+ struct cfg80211_pmsr_request *req;
+
+ spin_lock_bh(&wdev->pmsr_lock);
+ list_for_each_entry(req, &wdev->pmsr_list, list) {
+ if (req->nl_portid == portid) {
+ req->nl_portid = 0;
+ schedule_work(&wdev->pmsr_free_wk);
+ }
+ }
+ spin_unlock_bh(&wdev->pmsr_lock);
+}
+
+#endif /* __PMSR_H */
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 51380b5c32f2..5cb48d135fab 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1247,4 +1247,29 @@ rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_start_pmsr(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_pmsr_request *request)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie);
+ if (rdev->ops->start_pmsr)
+ ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void
+rdev_abort_pmsr(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_pmsr_request *request)
+{
+ trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie);
+ if (rdev->ops->abort_pmsr)
+ rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index d0e7472dd9fd..5123667f4569 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1183,7 +1183,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
switch (ftype) {
case CFG80211_BSS_FTYPE_BEACON:
ies->from_beacon = true;
- /* fall through to assign */
+ /* fall through */
case CFG80211_BSS_FTYPE_UNKNOWN:
rcu_assign_pointer(tmp.pub.beacon_ies, ies);
break;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index d536b07582f8..f741d8376a46 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -642,11 +642,15 @@ static bool cfg80211_is_all_idle(void)
* All devices must be idle as otherwise if you are actively
* scanning some new beacon hints could be learned and would
* count as new regulatory hints.
+ * Also if there is any other active beaconing interface we
+ * need not issue a disconnect hint and reset any info such
+ * as chan dfs state, etc.
*/
list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
wdev_lock(wdev);
- if (wdev->conn || wdev->current_bss)
+ if (wdev->conn || wdev->current_bss ||
+ cfg80211_beaconing_iface_active(wdev))
is_all_idle = false;
wdev_unlock(wdev);
}
@@ -1171,6 +1175,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
cfg80211_oper_and_ht_capa(&connect->ht_capa_mask,
rdev->wiphy.ht_capa_mod_mask);
+ cfg80211_oper_and_vht_capa(&connect->vht_capa_mask,
+ rdev->wiphy.vht_capa_mod_mask);
if (connkeys && connkeys->def >= 0) {
int idx;
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index c6a9446b4e6b..44b2ce1bb13a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -361,6 +361,24 @@ DECLARE_EVENT_CLASS(wiphy_wdev_evt,
TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);
+DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %lld",
+ WIPHY_PR_ARG, WDEV_PR_ARG,
+ (unsigned long long)__entry->cookie)
+);
+
DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
TP_ARGS(wiphy, wdev)
@@ -770,9 +788,9 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer,
);
TRACE_EVENT(rdev_dump_station,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
u8 *mac),
- TP_ARGS(wiphy, netdev, idx, mac),
+ TP_ARGS(wiphy, netdev, _idx, mac),
TP_STRUCT__entry(
WIPHY_ENTRY
NETDEV_ENTRY
@@ -783,7 +801,7 @@ TRACE_EVENT(rdev_dump_station,
WIPHY_ASSIGN;
NETDEV_ASSIGN;
MAC_ASSIGN(sta_mac, mac);
- __entry->idx = idx;
+ __entry->idx = _idx;
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", idx: %d",
WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
@@ -847,9 +865,9 @@ DEFINE_EVENT(mpath_evt, rdev_get_mpath,
);
TRACE_EVENT(rdev_dump_mpath,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
u8 *dst, u8 *next_hop),
- TP_ARGS(wiphy, netdev, idx, dst, next_hop),
+ TP_ARGS(wiphy, netdev, _idx, dst, next_hop),
TP_STRUCT__entry(
WIPHY_ENTRY
NETDEV_ENTRY
@@ -862,7 +880,7 @@ TRACE_EVENT(rdev_dump_mpath,
NETDEV_ASSIGN;
MAC_ASSIGN(dst, dst);
MAC_ASSIGN(next_hop, next_hop);
- __entry->idx = idx;
+ __entry->idx = _idx;
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
MAC_PR_FMT ", next hop: " MAC_PR_FMT,
@@ -892,9 +910,9 @@ TRACE_EVENT(rdev_get_mpp,
);
TRACE_EVENT(rdev_dump_mpp,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
u8 *dst, u8 *mpp),
- TP_ARGS(wiphy, netdev, idx, mpp, dst),
+ TP_ARGS(wiphy, netdev, _idx, mpp, dst),
TP_STRUCT__entry(
WIPHY_ENTRY
NETDEV_ENTRY
@@ -907,7 +925,7 @@ TRACE_EVENT(rdev_dump_mpp,
NETDEV_ASSIGN;
MAC_ASSIGN(dst, dst);
MAC_ASSIGN(mpp, mpp);
- __entry->idx = idx;
+ __entry->idx = _idx;
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
MAC_PR_FMT ", mpp: " MAC_PR_FMT,
@@ -1673,8 +1691,8 @@ TRACE_EVENT(rdev_tdls_mgmt,
);
TRACE_EVENT(rdev_dump_survey,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx),
- TP_ARGS(wiphy, netdev, idx),
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx),
+ TP_ARGS(wiphy, netdev, _idx),
TP_STRUCT__entry(
WIPHY_ENTRY
NETDEV_ENTRY
@@ -1683,7 +1701,7 @@ TRACE_EVENT(rdev_dump_survey,
TP_fast_assign(
WIPHY_ASSIGN;
NETDEV_ASSIGN;
- __entry->idx = idx;
+ __entry->idx = _idx;
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
@@ -2502,6 +2520,16 @@ TRACE_EVENT(rdev_get_ftm_responder_stats,
__entry->out_of_window)
);
+DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie)
+);
+
+DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie)
+);
+
/*************************************************************
* cfg80211 exported functions traces *
*************************************************************/
@@ -3294,6 +3322,46 @@ TRACE_EVENT(cfg80211_stop_iface,
TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
WIPHY_PR_ARG, WDEV_PR_ARG)
);
+
+TRACE_EVENT(cfg80211_pmsr_report,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ u64 cookie, const u8 *addr),
+ TP_ARGS(wiphy, wdev, cookie, addr),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ MAC_ENTRY(addr)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ MAC_ASSIGN(addr, addr);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, " MAC_PR_FMT,
+ WIPHY_PR_ARG, WDEV_PR_ARG,
+ (unsigned long long)__entry->cookie,
+ MAC_PR_ARG(addr))
+);
+
+TRACE_EVENT(cfg80211_pmsr_complete,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld",
+ WIPHY_PR_ARG, WDEV_PR_ARG,
+ (unsigned long long)__entry->cookie)
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index ef14d80ca03e..cd48cdd582c0 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1421,6 +1421,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
ies[pos + ext],
ext == 2))
pos = skip_ie(ies, ielen, pos);
+ else
+ break;
}
} else {
pos = skip_ie(ies, ielen, pos);
@@ -2013,33 +2015,32 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
case IEEE80211_VHT_CHANWIDTH_160MHZ:
if (supp_width == 0 &&
(ext_nss_bw == 1 || ext_nss_bw == 2))
- return DIV_ROUND_UP(max_vht_nss, 2);
+ return max_vht_nss / 2;
if (supp_width == 0 &&
ext_nss_bw == 3)
- return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ return (3 * max_vht_nss) / 4;
if (supp_width == 1 &&
ext_nss_bw == 3)
return 2 * max_vht_nss;
break;
case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
- if (supp_width == 0 &&
- (ext_nss_bw == 1 || ext_nss_bw == 2))
+ if (supp_width == 0 && ext_nss_bw == 1)
return 0; /* not possible */
if (supp_width == 0 &&
ext_nss_bw == 2)
- return DIV_ROUND_UP(max_vht_nss, 2);
+ return max_vht_nss / 2;
if (supp_width == 0 &&
ext_nss_bw == 3)
- return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ return (3 * max_vht_nss) / 4;
if (supp_width == 1 &&
ext_nss_bw == 0)
return 0; /* not possible */
if (supp_width == 1 &&
ext_nss_bw == 1)
- return DIV_ROUND_UP(max_vht_nss, 2);
+ return max_vht_nss / 2;
if (supp_width == 1 &&
ext_nss_bw == 2)
- return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ return (3 * max_vht_nss) / 4;
break;
}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index d49aa79b7997..5121729b8b63 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb,
}
len = *skb->data;
- needed = 1 + (len >> 4) + (len & 0x0f);
+ needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2;
if (!pskb_may_pull(skb, needed)) {
/* packet is too short to hold the addresses it claims
@@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr,
sk_for_each(s, &x25_list)
if ((!strcmp(addr->x25_addr,
x25_sk(s)->source_addr.x25_addr) ||
- !strcmp(addr->x25_addr,
+ !strcmp(x25_sk(s)->source_addr.x25_addr,
null_x25_address.x25_addr)) &&
s->sk_state == TCP_LISTEN) {
/*
@@ -688,11 +688,15 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- len = strlen(addr->sx25_addr.x25_addr);
- for (i = 0; i < len; i++) {
- if (!isdigit(addr->sx25_addr.x25_addr[i])) {
- rc = -EINVAL;
- goto out;
+ /* check for the null_x25_address */
+ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) {
+
+ len = strlen(addr->sx25_addr.x25_addr);
+ for (i = 0; i < len; i++) {
+ if (!isdigit(addr->sx25_addr.x25_addr[i])) {
+ rc = -EINVAL;
+ goto out;
+ }
}
}
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index 3c12cae32001..afb26221d8a8 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -142,6 +142,15 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
sk->sk_state_change(sk);
break;
}
+ case X25_CALL_REQUEST:
+ /* call collision */
+ x25->causediag.cause = 0x01;
+ x25->causediag.diagnostic = 0x48;
+
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25_disconnect(sk, EISCONN, 0x01, 0x48);
+ break;
+
case X25_CLEAR_REQUEST:
if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
goto out_clear;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 07156f43d295..a03268454a27 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -366,6 +366,7 @@ static int xsk_release(struct socket *sock)
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
+ xdp_put_umem(xs->umem);
sock_orphan(sk);
sock->sk = NULL;
@@ -713,18 +714,6 @@ static const struct proto_ops xsk_proto_ops = {
.sendpage = sock_no_sendpage,
};
-static void xsk_destruct(struct sock *sk)
-{
- struct xdp_sock *xs = xdp_sk(sk);
-
- if (!sock_flag(sk, SOCK_DEAD))
- return;
-
- xdp_put_umem(xs->umem);
-
- sk_refcnt_debug_dec(sk);
-}
-
static int xsk_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
@@ -751,9 +740,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sk->sk_family = PF_XDP;
- sk->sk_destruct = xsk_destruct;
- sk_refcnt_debug_inc(sk);
-
sock_set_flag(sk, SOCK_RCU_FREE);
xs = xdp_sk(sk);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 140270a13d54..5d43aaa17027 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -5,6 +5,7 @@ config XFRM
bool
depends on NET
select GRO_CELLS
+ select SKB_EXTENSIONS
config XFRM_OFFLOAD
bool
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 144c137886b1..b8736f56e7f7 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -32,6 +32,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
struct softnet_data *sd;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return skb;
@@ -39,7 +40,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
if (!(features & NETIF_F_HW_ESP))
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND)
return skb;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 684c0bc01e2c..b3b613660d44 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -38,8 +38,6 @@ struct xfrm_trans_cb {
#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
-static struct kmem_cache *secpath_cachep __ro_after_init;
-
static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
@@ -111,56 +109,24 @@ static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
return ret;
}
-void __secpath_destroy(struct sec_path *sp)
+struct sec_path *secpath_set(struct sk_buff *skb)
{
- int i;
- for (i = 0; i < sp->len; i++)
- xfrm_state_put(sp->xvec[i]);
- kmem_cache_free(secpath_cachep, sp);
-}
-EXPORT_SYMBOL(__secpath_destroy);
+ struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH);
-struct sec_path *secpath_dup(struct sec_path *src)
-{
- struct sec_path *sp;
-
- sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC);
+ sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
if (!sp)
return NULL;
- sp->len = 0;
- sp->olen = 0;
+ if (tmp) /* reused existing one (was COW'd if needed) */
+ return sp;
+ /* allocated new secpath */
memset(sp->ovec, 0, sizeof(sp->ovec));
+ sp->olen = 0;
+ sp->len = 0;
- if (src) {
- int i;
-
- memcpy(sp, src, sizeof(*sp));
- for (i = 0; i < sp->len; i++)
- xfrm_state_hold(sp->xvec[i]);
- }
- refcount_set(&sp->refcnt, 1);
return sp;
}
-EXPORT_SYMBOL(secpath_dup);
-
-int secpath_set(struct sk_buff *skb)
-{
- struct sec_path *sp;
-
- /* Allocate new secpath or COW existing one. */
- if (!skb->sp || refcount_read(&skb->sp->refcnt) != 1) {
- sp = secpath_dup(skb->sp);
- if (!sp)
- return -ENOMEM;
-
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
- }
- return 0;
-}
EXPORT_SYMBOL(secpath_set);
/* Fetch spi and seq from ipsec header */
@@ -236,6 +202,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
bool xfrm_gro = false;
bool crypto_done = false;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (encap_type < 0) {
x = xfrm_input_state(skb);
@@ -312,8 +279,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
break;
}
- err = secpath_set(skb);
- if (err) {
+ sp = secpath_set(skb);
+ if (!sp) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
goto drop;
}
@@ -328,7 +295,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
daddr = (xfrm_address_t *)(skb_network_header(skb) +
XFRM_SPI_SKB_CB(skb)->daddroff);
do {
- if (skb->sp->len == XFRM_MAX_DEPTH) {
+ sp = skb_sec_path(skb);
+
+ if (sp->len == XFRM_MAX_DEPTH) {
secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
@@ -344,7 +313,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
skb->mark = xfrm_smark_get(skb->mark, x);
- skb->sp->xvec[skb->sp->len++] = x;
+ sp->xvec[sp->len++] = x;
+
+ skb_dst_force(skb);
+ if (!skb_dst(skb)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+ goto drop;
+ }
lock:
spin_lock(&x->lock);
@@ -385,7 +360,6 @@ lock:
XFRM_SKB_CB(skb)->seq.input.low = seq;
XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
- skb_dst_force(skb);
dev_hold(skb->dev);
if (crypto_done)
@@ -468,8 +442,9 @@ resume:
nf_reset(skb);
if (decaps) {
- if (skb->sp)
- skb->sp->olen = 0;
+ sp = skb_sec_path(skb);
+ if (sp)
+ sp->olen = 0;
skb_dst_drop(skb);
gro_cells_receive(&gro_cells, skb);
return 0;
@@ -480,8 +455,9 @@ resume:
err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
if (xfrm_gro) {
- if (skb->sp)
- skb->sp->olen = 0;
+ sp = skb_sec_path(skb);
+ if (sp)
+ sp->olen = 0;
skb_dst_drop(skb);
gro_cells_receive(&gro_cells, skb);
return err;
@@ -546,11 +522,6 @@ void __init xfrm_input_init(void)
if (err)
gro_cells.cells = NULL;
- secpath_cachep = kmem_cache_create("secpath_cache",
- sizeof(struct sec_path),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL);
-
for_each_possible_cpu(i) {
struct xfrm_trans_tasklet *trans;
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index d679fa0f44b3..6be8c7df15bb 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -251,7 +251,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
struct xfrm_if *xi;
bool xnet;
- if (err && !skb->sp)
+ if (err && !secpath_exists(skb))
return 0;
x = xfrm_input_state(skb);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 4ae87c5ce2e3..9333153bafda 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -102,6 +102,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
skb_dst_force(skb);
if (!skb_dst(skb)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ err = -EHOSTUNREACH;
goto error_nolock;
}
@@ -218,19 +219,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
if (xfrm_dev_offload_ok(skb, x)) {
struct sec_path *sp;
- sp = secpath_dup(skb->sp);
+ sp = secpath_set(skb);
if (!sp) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
kfree_skb(skb);
return -ENOMEM;
}
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
skb->encapsulation = 1;
sp->olen++;
- sp->xvec[skb->sp->len++] = x;
+ sp->xvec[sp->len++] = x;
xfrm_state_hold(x);
if (skb_is_gso(skb)) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 119a427d9b2b..934492bad8e0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -26,6 +26,7 @@
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
+#include <linux/rhashtable.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/xfrm.h>
@@ -45,6 +46,99 @@ struct xfrm_flo {
u8 flags;
};
+/* prefixes smaller than this are stored in lists, not trees. */
+#define INEXACT_PREFIXLEN_IPV4 16
+#define INEXACT_PREFIXLEN_IPV6 48
+
+struct xfrm_pol_inexact_node {
+ struct rb_node node;
+ union {
+ xfrm_address_t addr;
+ struct rcu_head rcu;
+ };
+ u8 prefixlen;
+
+ struct rb_root root;
+
+ /* the policies matching this node, can be empty list */
+ struct hlist_head hhead;
+};
+
+/* xfrm inexact policy search tree:
+ * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
+ * |
+ * +---- root_d: sorted by daddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | +- root: sorted by saddr/prefix
+ * | | |
+ * | | xfrm_pol_inexact_node
+ * | | |
+ * | | + root: unused
+ * | | |
+ * | | + hhead: saddr:daddr policies
+ * | |
+ * | +- coarse policies and all any:daddr policies
+ * |
+ * +---- root_s: sorted by saddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | + root: unused
+ * | |
+ * | + hhead: saddr:any policies
+ * |
+ * +---- coarse policies and all any:any policies
+ *
+ * Lookups return four candidate lists:
+ * 1. any:any list from top-level xfrm_pol_inexact_bin
+ * 2. any:daddr list from daddr tree
+ * 3. saddr:daddr list from 2nd level daddr tree
+ * 4. saddr:any list from saddr tree
+ *
+ * This result set then needs to be searched for the policy with
+ * the lowest priority. If two results have same prio, youngest one wins.
+ */
+
+struct xfrm_pol_inexact_key {
+ possible_net_t net;
+ u32 if_id;
+ u16 family;
+ u8 dir, type;
+};
+
+struct xfrm_pol_inexact_bin {
+ struct xfrm_pol_inexact_key k;
+ struct rhash_head head;
+ /* list containing '*:*' policies */
+ struct hlist_head hhead;
+
+ seqcount_t count;
+ /* tree sorted by daddr/prefix */
+ struct rb_root root_d;
+
+ /* tree sorted by saddr/prefix */
+ struct rb_root root_s;
+
+ /* slow path below */
+ struct list_head inexact_bins;
+ struct rcu_head rcu;
+};
+
+enum xfrm_pol_inexact_candidate_type {
+ XFRM_POL_CAND_BOTH,
+ XFRM_POL_CAND_SADDR,
+ XFRM_POL_CAND_DADDR,
+ XFRM_POL_CAND_ANY,
+
+ XFRM_POL_CAND_MAX,
+};
+
+struct xfrm_pol_inexact_candidates {
+ struct hlist_head *res[XFRM_POL_CAND_MAX];
+};
+
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
@@ -55,6 +149,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
static struct kmem_cache *xfrm_dst_cache __ro_after_init;
static __read_mostly seqcount_t xfrm_policy_hash_generation;
+static struct rhashtable xfrm_policy_inexact_table;
+static const struct rhashtable_params xfrm_pol_inexact_params;
+
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
@@ -64,6 +161,25 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
+ u32 if_id);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net,
+ u8 type, u16 family, u8 dir, u32 if_id);
+static struct xfrm_policy *
+xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
+ bool excl);
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy);
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr);
+
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
return refcount_inc_not_zero(&policy->refcnt);
@@ -269,6 +385,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
if (policy) {
write_pnet(&policy->xp_net, net);
INIT_LIST_HEAD(&policy->walk.all);
+ INIT_HLIST_NODE(&policy->bydst_inexact_list);
INIT_HLIST_NODE(&policy->bydst);
INIT_HLIST_NODE(&policy->byidx);
rwlock_init(&policy->lock);
@@ -365,7 +482,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
hash = __sel_hash(sel, family, hmask, dbits, sbits);
if (hash == hmask + 1)
- return &net->xfrm.policy_inexact[dir];
+ return NULL;
return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
@@ -563,6 +680,533 @@ static void xfrm_hash_resize(struct work_struct *work)
mutex_unlock(&hash_resize_mutex);
}
+static void xfrm_hash_reset_inexact_table(struct net *net)
+{
+ struct xfrm_pol_inexact_bin *b;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins)
+ INIT_HLIST_HEAD(&b->hhead);
+}
+
+/* Make sure *pol can be inserted into fastbin.
+ * Useful to check that later insert requests will be sucessful
+ * (provided xfrm_policy_lock is held throughout).
+ */
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
+{
+ struct xfrm_pol_inexact_bin *bin, *prev;
+ struct xfrm_pol_inexact_key k = {
+ .family = pol->family,
+ .type = pol->type,
+ .dir = dir,
+ .if_id = pol->if_id,
+ };
+ struct net *net = xp_net(pol);
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ write_pnet(&k.net, net);
+ bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+ if (bin)
+ return bin;
+
+ bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
+ if (!bin)
+ return NULL;
+
+ bin->k = k;
+ INIT_HLIST_HEAD(&bin->hhead);
+ bin->root_d = RB_ROOT;
+ bin->root_s = RB_ROOT;
+ seqcount_init(&bin->count);
+
+ prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
+ &bin->k, &bin->head,
+ xfrm_pol_inexact_params);
+ if (!prev) {
+ list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
+ return bin;
+ }
+
+ kfree(bin);
+
+ return IS_ERR(prev) ? NULL : prev;
+}
+
+static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
+ int family, u8 prefixlen)
+{
+ if (xfrm_addr_any(addr, family))
+ return true;
+
+ if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
+ return true;
+
+ if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
+ return true;
+
+ return false;
+}
+
+static bool
+xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
+{
+ const xfrm_address_t *addr;
+ bool saddr_any, daddr_any;
+ u8 prefixlen;
+
+ addr = &policy->selector.saddr;
+ prefixlen = policy->selector.prefixlen_s;
+
+ saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ addr = &policy->selector.daddr;
+ prefixlen = policy->selector.prefixlen_d;
+ daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ return saddr_any && daddr_any;
+}
+
+static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
+ const xfrm_address_t *addr, u8 prefixlen)
+{
+ node->addr = *addr;
+ node->prefixlen = prefixlen;
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
+{
+ struct xfrm_pol_inexact_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC);
+ if (node)
+ xfrm_pol_inexact_node_init(node, addr, prefixlen);
+
+ return node;
+}
+
+static int xfrm_policy_addr_delta(const xfrm_address_t *a,
+ const xfrm_address_t *b,
+ u8 prefixlen, u16 family)
+{
+ unsigned int pdw, pbi;
+ int delta = 0;
+
+ switch (family) {
+ case AF_INET:
+ if (sizeof(long) == 4 && prefixlen == 0)
+ return ntohl(a->a4) - ntohl(b->a4);
+ return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) -
+ (ntohl(b->a4) & ((~0UL << (32 - prefixlen))));
+ case AF_INET6:
+ pdw = prefixlen >> 5;
+ pbi = prefixlen & 0x1f;
+
+ if (pdw) {
+ delta = memcmp(a->a6, b->a6, pdw << 2);
+ if (delta)
+ return delta;
+ }
+ if (pbi) {
+ u32 mask = ~0u << (32 - pbi);
+
+ delta = (ntohl(a->a6[pdw]) & mask) -
+ (ntohl(b->a6[pdw]) & mask);
+ }
+ break;
+ default:
+ break;
+ }
+
+ return delta;
+}
+
+static void xfrm_policy_inexact_list_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{
+ unsigned int matched_s, matched_d;
+ struct hlist_node *newpos = NULL;
+ struct xfrm_policy *policy, *p;
+
+ matched_s = 0;
+ matched_d = 0;
+
+ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ bool matches_s, matches_d;
+
+ if (!policy->bydst_reinsert)
+ continue;
+
+ WARN_ON_ONCE(policy->family != family);
+
+ policy->bydst_reinsert = false;
+ hlist_for_each_entry(p, &n->hhead, bydst) {
+ if (policy->priority >= p->priority)
+ newpos = &p->bydst;
+ else
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind(&policy->bydst, newpos);
+ else
+ hlist_add_head(&policy->bydst, &n->hhead);
+
+ /* paranoia checks follow.
+ * Check that the reinserted policy matches at least
+ * saddr or daddr for current node prefix.
+ *
+ * Matching both is fine, matching saddr in one policy
+ * (but not daddr) and then matching only daddr in another
+ * is a bug.
+ */
+ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ if (matches_s && matches_d)
+ continue;
+
+ WARN_ON_ONCE(!matches_s && !matches_d);
+ if (matches_s)
+ matched_s++;
+ if (matches_d)
+ matched_d++;
+ WARN_ON_ONCE(matched_s && matched_d);
+ }
+}
+
+static void xfrm_policy_inexact_node_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ struct rb_root *new,
+ u16 family)
+{
+ struct rb_node **p, *parent = NULL;
+ struct xfrm_pol_inexact_node *node;
+
+ /* we should not have another subtree here */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
+
+ p = &new->rb_node;
+ while (*p) {
+ u8 prefixlen;
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ prefixlen = min(node->prefixlen, n->prefixlen);
+
+ delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
+ prefixlen, family);
+ if (delta < 0) {
+ p = &parent->rb_left;
+ } else if (delta > 0) {
+ p = &parent->rb_right;
+ } else {
+ struct xfrm_policy *tmp;
+
+ hlist_for_each_entry(tmp, &node->hhead, bydst)
+ tmp->bydst_reinsert = true;
+ hlist_for_each_entry(tmp, &n->hhead, bydst)
+ tmp->bydst_reinsert = true;
+
+ INIT_HLIST_HEAD(&node->hhead);
+ xfrm_policy_inexact_list_reinsert(net, node, family);
+
+ if (node->prefixlen == n->prefixlen) {
+ kfree_rcu(n, rcu);
+ return;
+ }
+
+ rb_erase(*p, new);
+ kfree_rcu(n, rcu);
+ n = node;
+ n->prefixlen = prefixlen;
+ *p = new->rb_node;
+ parent = NULL;
+ }
+ }
+
+ rb_link_node_rcu(&n->node, parent, p);
+ rb_insert_color(&n->node, new);
+}
+
+/* merge nodes v and n */
+static void xfrm_policy_inexact_node_merge(struct net *net,
+ struct xfrm_pol_inexact_node *v,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{
+ struct xfrm_pol_inexact_node *node;
+ struct xfrm_policy *tmp;
+ struct rb_node *rnode;
+
+ /* To-be-merged node v has a subtree.
+ *
+ * Dismantle it and insert its nodes to n->root.
+ */
+ while ((rnode = rb_first(&v->root)) != NULL) {
+ node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
+ rb_erase(&node->node, &v->root);
+ xfrm_policy_inexact_node_reinsert(net, node, &n->root,
+ family);
+ }
+
+ hlist_for_each_entry(tmp, &v->hhead, bydst)
+ tmp->bydst_reinsert = true;
+ hlist_for_each_entry(tmp, &n->hhead, bydst)
+ tmp->bydst_reinsert = true;
+
+ INIT_HLIST_HEAD(&n->hhead);
+ xfrm_policy_inexact_list_reinsert(net, n, family);
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_policy_inexact_insert_node(struct net *net,
+ struct rb_root *root,
+ xfrm_address_t *addr,
+ u16 family, u8 prefixlen, u8 dir)
+{
+ struct xfrm_pol_inexact_node *cached = NULL;
+ struct rb_node **p, *parent = NULL;
+ struct xfrm_pol_inexact_node *node;
+
+ p = &root->rb_node;
+ while (*p) {
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen,
+ family);
+ if (delta == 0 && prefixlen >= node->prefixlen) {
+ WARN_ON_ONCE(cached); /* ipsec policies got lost */
+ return node;
+ }
+
+ if (delta < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ if (prefixlen < node->prefixlen) {
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ prefixlen,
+ family);
+ if (delta)
+ continue;
+
+ /* This node is a subnet of the new prefix. It needs
+ * to be removed and re-inserted with the smaller
+ * prefix and all nodes that are now also covered
+ * by the reduced prefixlen.
+ */
+ rb_erase(&node->node, root);
+
+ if (!cached) {
+ xfrm_pol_inexact_node_init(node, addr,
+ prefixlen);
+ cached = node;
+ } else {
+ /* This node also falls within the new
+ * prefixlen. Merge the to-be-reinserted
+ * node and this one.
+ */
+ xfrm_policy_inexact_node_merge(net, node,
+ cached, family);
+ kfree_rcu(node, rcu);
+ }
+
+ /* restart */
+ p = &root->rb_node;
+ parent = NULL;
+ }
+ }
+
+ node = cached;
+ if (!node) {
+ node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
+ if (!node)
+ return NULL;
+ }
+
+ rb_link_node_rcu(&node->node, parent, p);
+ rb_insert_color(&node->node, root);
+
+ return node;
+}
+
+static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
+{
+ struct xfrm_pol_inexact_node *node;
+ struct rb_node *rn = rb_first(r);
+
+ while (rn) {
+ node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
+
+ xfrm_policy_inexact_gc_tree(&node->root, rm);
+ rn = rb_next(rn);
+
+ if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
+ WARN_ON_ONCE(rm);
+ continue;
+ }
+
+ rb_erase(&node->node, r);
+ kfree_rcu(node, rcu);
+ }
+}
+
+static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
+{
+ write_seqcount_begin(&b->count);
+ xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
+ xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
+ write_seqcount_end(&b->count);
+
+ if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
+ !hlist_empty(&b->hhead)) {
+ WARN_ON_ONCE(net_exit);
+ return;
+ }
+
+ if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
+ xfrm_pol_inexact_params) == 0) {
+ list_del(&b->inexact_bins);
+ kfree_rcu(b, rcu);
+ }
+}
+
+static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
+{
+ struct net *net = read_pnet(&b->k.net);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ __xfrm_policy_inexact_prune_bin(b, false);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+
+static void __xfrm_policy_inexact_flush(struct net *net)
+{
+ struct xfrm_pol_inexact_bin *bin, *t;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+}
+
+static struct hlist_head *
+xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
+ struct xfrm_policy *policy, u8 dir)
+{
+ struct xfrm_pol_inexact_node *n;
+ struct net *net;
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ if (xfrm_policy_inexact_insert_use_any_list(policy))
+ return &bin->hhead;
+
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d)) {
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_s,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s,
+ dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+ }
+
+ /* daddr is fixed */
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_d,
+ &policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ /* saddr is wildcard */
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s))
+ return &n->hhead;
+
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &n->root,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+}
+
+static struct xfrm_policy *
+xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
+{
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+ struct net *net;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ return ERR_PTR(-ENOMEM);
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
+ if (!chain) {
+ __xfrm_policy_inexact_prune_bin(bin, false);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ if (delpol && excl) {
+ __xfrm_policy_inexact_prune_bin(bin, false);
+ return ERR_PTR(-EEXIST);
+ }
+
+ chain = &net->xfrm.policy_inexact[dir];
+ xfrm_policy_insert_inexact_list(chain, policy);
+
+ if (delpol)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+
+ return delpol;
+}
+
static void xfrm_hash_rebuild(struct work_struct *work)
{
struct net *net = container_of(work, struct net,
@@ -592,7 +1236,50 @@ static void xfrm_hash_rebuild(struct work_struct *work)
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ /* make sure that we can insert the indirect policies again before
+ * we start with destructive action.
+ */
+ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+ struct xfrm_pol_inexact_bin *bin;
+ u8 dbits, sbits;
+
+ dir = xfrm_policy_id2dir(policy->index);
+ if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
+ continue;
+
+ if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+ if (policy->family == AF_INET) {
+ dbits = rbits4;
+ sbits = lbits4;
+ } else {
+ dbits = rbits6;
+ sbits = lbits6;
+ }
+ } else {
+ if (policy->family == AF_INET) {
+ dbits = lbits4;
+ sbits = rbits4;
+ } else {
+ dbits = lbits6;
+ sbits = rbits6;
+ }
+ }
+
+ if (policy->selector.prefixlen_d < dbits ||
+ policy->selector.prefixlen_s < sbits)
+ continue;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ goto out_unlock;
+
+ if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
+ goto out_unlock;
+ }
+
/* reset the bydst and inexact table in all directions */
+ xfrm_hash_reset_inexact_table(net);
+
for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
hmask = net->xfrm.policy_bydst[dir].hmask;
@@ -616,15 +1303,23 @@ static void xfrm_hash_rebuild(struct work_struct *work)
/* re-insert all policies by order of creation */
list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
- if (policy->walk.dead ||
- xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
+ if (policy->walk.dead)
+ continue;
+ dir = xfrm_policy_id2dir(policy->index);
+ if (dir >= XFRM_POLICY_MAX) {
/* skip socket policies */
continue;
}
newpos = NULL;
chain = policy_hash_bysel(net, &policy->selector,
- policy->family,
- xfrm_policy_id2dir(policy->index));
+ policy->family, dir);
+ if (!chain) {
+ void *p = xfrm_policy_inexact_insert(policy, dir, 0);
+
+ WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
+ continue;
+ }
+
hlist_for_each_entry(pol, chain, bydst) {
if (policy->priority >= pol->priority)
newpos = &pol->bydst;
@@ -637,6 +1332,8 @@ static void xfrm_hash_rebuild(struct work_struct *work)
hlist_add_head_rcu(&policy->bydst, chain);
}
+out_unlock:
+ __xfrm_policy_inexact_flush(net);
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
@@ -740,18 +1437,97 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
return false;
}
-int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
- struct net *net = xp_net(policy);
- struct xfrm_policy *pol;
- struct xfrm_policy *delpol;
- struct hlist_head *chain;
- struct hlist_node *newpos;
+ const struct xfrm_pol_inexact_key *k = data;
+ u32 a = k->type << 24 | k->dir << 16 | k->family;
+
+ return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
+ seed);
+}
+
+static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
+{
+ const struct xfrm_pol_inexact_bin *b = data;
+
+ return xfrm_pol_bin_key(&b->k, 0, seed);
+}
+
+static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xfrm_pol_inexact_key *key = arg->key;
+ const struct xfrm_pol_inexact_bin *b = ptr;
+ int ret;
+
+ if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
+ return -1;
+
+ ret = b->k.dir ^ key->dir;
+ if (ret)
+ return ret;
+
+ ret = b->k.type ^ key->type;
+ if (ret)
+ return ret;
+
+ ret = b->k.family ^ key->family;
+ if (ret)
+ return ret;
+
+ return b->k.if_id ^ key->if_id;
+}
+
+static const struct rhashtable_params xfrm_pol_inexact_params = {
+ .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
+ .hashfn = xfrm_pol_bin_key,
+ .obj_hashfn = xfrm_pol_bin_obj,
+ .obj_cmpfn = xfrm_pol_bin_cmp,
+ .automatic_shrinking = true,
+};
+
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy)
+{
+ struct xfrm_policy *pol, *delpol = NULL;
+ struct hlist_node *newpos = NULL;
+ int i = 0;
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+ if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
+ !selector_cmp(&pol->selector, &policy->selector) &&
+ xfrm_policy_mark_match(policy, pol) &&
+ xfrm_sec_ctx_match(pol->security, policy->security) &&
+ !WARN_ON(delpol)) {
+ delpol = pol;
+ if (policy->priority > pol->priority)
+ continue;
+ } else if (policy->priority >= pol->priority) {
+ newpos = &pol->bydst_inexact_list;
+ continue;
+ }
+ if (delpol)
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
+ else
+ hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+ pol->pos = i;
+ i++;
+ }
+}
+
+static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
+ struct xfrm_policy *policy,
+ bool excl)
+{
+ struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
- delpol = NULL;
- newpos = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
pol->if_id == policy->if_id &&
@@ -759,24 +1535,45 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
- if (excl) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return -EEXIST;
- }
+ if (excl)
+ return ERR_PTR(-EEXIST);
delpol = pol;
if (policy->priority > pol->priority)
continue;
} else if (policy->priority >= pol->priority) {
- newpos = &pol->bydst;
+ newpos = pol;
continue;
}
if (delpol)
break;
}
+
if (newpos)
- hlist_add_behind_rcu(&policy->bydst, newpos);
+ hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
else
hlist_add_head_rcu(&policy->bydst, chain);
+
+ return delpol;
+}
+
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+ struct net *net = xp_net(policy);
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+ if (chain)
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ else
+ delpol = xfrm_policy_inexact_insert(policy, dir, excl);
+
+ if (IS_ERR(delpol)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return PTR_ERR(delpol);
+ }
+
__xfrm_policy_link(policy, dir);
/* After previous checking, family can either be AF_INET or AF_INET6 */
@@ -806,43 +1603,96 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
+static struct xfrm_policy *
+__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx)
+{
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (pol->type == type &&
+ pol->if_id == if_id &&
+ (mark & pol->mark.m) == pol->mark.v &&
+ !selector_cmp(sel, &pol->selector) &&
+ xfrm_sec_ctx_match(ctx, pol->security))
+ return pol;
+ }
+
+ return NULL;
+}
+
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
u8 type, int dir,
struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_bin *bin = NULL;
+ struct xfrm_policy *pol, *ret = NULL;
struct hlist_head *chain;
*err = 0;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
- ret = NULL;
- hlist_for_each_entry(pol, chain, bydst) {
- if (pol->type == type &&
- pol->if_id == if_id &&
- (mark & pol->mark.m) == pol->mark.v &&
- !selector_cmp(sel, &pol->selector) &&
- xfrm_sec_ctx_match(ctx, pol->security)) {
- xfrm_pol_hold(pol);
- if (delete) {
- *err = security_xfrm_policy_delete(
- pol->security);
- if (*err) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return pol;
- }
- __xfrm_policy_unlink(pol, dir);
+ if (!chain) {
+ struct xfrm_pol_inexact_candidates cand;
+ int i;
+
+ bin = xfrm_policy_inexact_lookup(net, type,
+ sel->family, dir, if_id);
+ if (!bin) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
+ }
+
+ if (!xfrm_policy_find_inexact_candidates(&cand, bin,
+ &sel->saddr,
+ &sel->daddr)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
+ }
+
+ pol = NULL;
+ for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
+ struct xfrm_policy *tmp;
+
+ tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
+ if_id, type, dir,
+ sel, ctx);
+ if (!tmp)
+ continue;
+
+ if (!pol || tmp->pos < pol->pos)
+ pol = tmp;
+ }
+ } else {
+ pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
+ sel, ctx);
+ }
+
+ if (pol) {
+ xfrm_pol_hold(pol);
+ if (delete) {
+ *err = security_xfrm_policy_delete(pol->security);
+ if (*err) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
}
- ret = pol;
- break;
+ __xfrm_policy_unlink(pol, dir);
}
+ ret = pol;
}
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
+ if (bin && delete)
+ xfrm_policy_inexact_prune_bin(bin);
return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -892,36 +1742,19 @@ EXPORT_SYMBOL(xfrm_policy_byid);
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
- int dir, err = 0;
+ struct xfrm_policy *pol;
+ int err = 0;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ if (pol->walk.dead ||
+ xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0, task_valid);
- return err;
- }
- }
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(
- pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0,
- task_valid);
- return err;
- }
- }
+ err = security_xfrm_policy_delete(pol->security);
+ if (err) {
+ xfrm_audit_policy_delete(pol, 0, task_valid);
+ return err;
}
}
return err;
@@ -937,6 +1770,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
int dir, err = 0, cnt = 0;
+ struct xfrm_policy *pol;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
@@ -944,48 +1778,25 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (err)
goto out;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
-
- again1:
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
-
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again1;
- }
-
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- again2:
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again2;
- }
- }
+again:
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ dir = xfrm_policy_id2dir(pol->index);
+ if (pol->walk.dead ||
+ dir >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
+ __xfrm_policy_unlink(pol, dir);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ cnt++;
+ xfrm_audit_policy_delete(pol, 1, task_valid);
+ xfrm_policy_kill(pol);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ goto again;
}
- if (!cnt)
+ if (cnt)
+ __xfrm_policy_inexact_flush(net);
+ else
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1084,21 +1895,188 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
if (match)
ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
dir);
-
return ret;
}
+static struct xfrm_pol_inexact_node *
+xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
+ seqcount_t *count,
+ const xfrm_address_t *addr, u16 family)
+{
+ const struct rb_node *parent;
+ int seq;
+
+again:
+ seq = read_seqcount_begin(count);
+
+ parent = rcu_dereference_raw(r->rb_node);
+ while (parent) {
+ struct xfrm_pol_inexact_node *node;
+ int delta;
+
+ node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen, family);
+ if (delta < 0) {
+ parent = rcu_dereference_raw(parent->rb_left);
+ continue;
+ } else if (delta > 0) {
+ parent = rcu_dereference_raw(parent->rb_right);
+ continue;
+ }
+
+ return node;
+ }
+
+ if (read_seqcount_retry(count, seq))
+ goto again;
+
+ return NULL;
+}
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct xfrm_pol_inexact_node *n;
+ u16 family;
+
+ if (!b)
+ return false;
+
+ family = b->k.family;
+ memset(cand, 0, sizeof(*cand));
+ cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
+ family);
+ if (n) {
+ cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
+ n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
+ family);
+ if (n)
+ cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
+ }
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
+ family);
+ if (n)
+ cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
+
+ return true;
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{
+ struct xfrm_pol_inexact_key k = {
+ .family = family,
+ .type = type,
+ .dir = dir,
+ .if_id = if_id,
+ };
+
+ write_pnet(&k.net, net);
+
+ return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{
+ struct xfrm_pol_inexact_bin *bin;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ rcu_read_lock();
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ rcu_read_unlock();
+
+ return bin;
+}
+
+static struct xfrm_policy *
+__xfrm_policy_eval_candidates(struct hlist_head *chain,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{
+ u32 priority = prefer ? prefer->priority : ~0u;
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ int err;
+
+ if (pol->priority > priority)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+ if (err) {
+ if (err != -ESRCH)
+ return ERR_PTR(err);
+
+ continue;
+ }
+
+ if (prefer) {
+ /* matches. Is it older than *prefer? */
+ if (pol->priority == priority &&
+ prefer->pos < pol->pos)
+ return prefer;
+ }
+
+ return pol;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_policy *
+xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{
+ struct xfrm_policy *tmp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
+ tmp = __xfrm_policy_eval_candidates(cand->res[i],
+ prefer,
+ fl, type, family, dir,
+ if_id);
+ if (!tmp)
+ continue;
+
+ if (IS_ERR(tmp))
+ return tmp;
+ prefer = tmp;
+ }
+
+ return prefer;
+}
+
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
const struct flowi *fl,
u16 family, u8 dir,
u32 if_id)
{
- int err;
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_candidates cand;
const xfrm_address_t *daddr, *saddr;
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
unsigned int sequence;
- u32 priority;
+ int err;
daddr = xfrm_flowi_daddr(fl, family);
saddr = xfrm_flowi_saddr(fl, family);
@@ -1112,7 +2090,6 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
chain = policy_hash_direct(net, daddr, saddr, family, dir);
} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
- priority = ~0U;
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
@@ -1125,29 +2102,23 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
} else {
ret = pol;
- priority = ret->priority;
break;
}
}
- chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry_rcu(pol, chain, bydst) {
- if ((pol->priority >= priority) && ret)
- break;
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
+ daddr))
+ goto skip_inexact;
- err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
- if (err) {
- if (err == -ESRCH)
- continue;
- else {
- ret = ERR_PTR(err);
- goto fail;
- }
- } else {
- ret = pol;
- break;
- }
+ pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
+ family, dir, if_id);
+ if (pol) {
+ ret = pol;
+ if (IS_ERR(pol))
+ goto fail;
}
+skip_inexact:
if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
goto retry;
@@ -1239,6 +2210,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
/* Socket policies are not hashed. */
if (!hlist_unhashed(&pol->bydst)) {
hlist_del_rcu(&pol->bydst);
+ hlist_del_init(&pol->bydst_inexact_list);
hlist_del(&pol->byidx);
}
@@ -1811,7 +2783,7 @@ static void xfrm_policy_queue_process(struct timer_list *t)
pq->timeout = pq->timeout << 1;
if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
xfrm_pol_hold(pol);
- goto out;
+ goto out;
}
dst_release(dst);
@@ -2225,11 +3197,12 @@ EXPORT_SYMBOL(xfrm_lookup_route);
static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
+ struct sec_path *sp = skb_sec_path(skb);
struct xfrm_state *x;
- if (!skb->sp || idx < 0 || idx >= skb->sp->len)
+ if (!sp || idx < 0 || idx >= sp->len)
return 0;
- x = skb->sp->xvec[idx];
+ x = sp->xvec[idx];
if (!x->type->reject)
return 0;
return x->type->reject(x, skb, fl);
@@ -2329,6 +3302,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
struct flowi fl;
int xerr_idx = -1;
const struct xfrm_if_cb *ifcb;
+ struct sec_path *sp;
struct xfrm_if *xi;
u32 if_id = 0;
@@ -2353,11 +3327,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
nf_nat_decode_session(skb, &fl, family);
/* First, check used SA against their selectors. */
- if (skb->sp) {
+ sp = skb_sec_path(skb);
+ if (sp) {
int i;
- for (i = skb->sp->len-1; i >= 0; i--) {
- struct xfrm_state *x = skb->sp->xvec[i];
+ for (i = sp->len - 1; i >= 0; i--) {
+ struct xfrm_state *x = sp->xvec[i];
if (!xfrm_selector_match(&x->sel, &fl, family)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
return 0;
@@ -2384,7 +3359,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
if (!pol) {
- if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
+ if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
xfrm_secpath_reject(xerr_idx, skb, &fl);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
return 0;
@@ -2413,7 +3388,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
#endif
if (pol->action == XFRM_POLICY_ALLOW) {
- struct sec_path *sp;
static struct sec_path dummy;
struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
@@ -2421,7 +3395,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
int ti = 0;
int i, k;
- if ((sp = skb->sp) == NULL)
+ sp = skb_sec_path(skb);
+ if (!sp)
sp = &dummy;
for (pi = 0; pi < npols; pi++) {
@@ -2816,13 +3791,17 @@ static void xfrm_statistics_fini(struct net *net)
static int __net_init xfrm_policy_init(struct net *net)
{
unsigned int hmask, sz;
- int dir;
+ int dir, err;
- if (net_eq(net, &init_net))
+ if (net_eq(net, &init_net)) {
xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
sizeof(struct xfrm_dst),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ err = rhashtable_init(&xfrm_policy_inexact_table,
+ &xfrm_pol_inexact_params);
+ BUG_ON(err);
+ }
hmask = 8 - 1;
sz = (hmask+1) * sizeof(struct hlist_head);
@@ -2857,6 +3836,7 @@ static int __net_init xfrm_policy_init(struct net *net)
seqlock_init(&net->xfrm.policy_hthresh.lock);
INIT_LIST_HEAD(&net->xfrm.policy_all);
+ INIT_LIST_HEAD(&net->xfrm.inexact_bins);
INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
return 0;
@@ -2875,6 +3855,7 @@ out_byidx:
static void xfrm_policy_fini(struct net *net)
{
+ struct xfrm_pol_inexact_bin *b, *t;
unsigned int sz;
int dir;
@@ -2900,6 +3881,11 @@ static void xfrm_policy_fini(struct net *net)
sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(b, true);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
static int __net_init xfrm_net_init(struct net *net)
@@ -3065,7 +4051,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
}
}
chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
if ((pol->priority >= priority) && ret)
break;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index dc4a9f1fb941..23c92891758a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -426,6 +426,12 @@ static void xfrm_put_mode(struct xfrm_mode *mode)
module_put(mode->owner);
}
+void xfrm_state_free(struct xfrm_state *x)
+{
+ kmem_cache_free(xfrm_state_cache, x);
+}
+EXPORT_SYMBOL(xfrm_state_free);
+
static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
tasklet_hrtimer_cancel(&x->mtimer);
@@ -452,7 +458,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
}
xfrm_dev_state_free(x);
security_xfrm_state_free(x);
- kmem_cache_free(xfrm_state_cache, x);
+ xfrm_state_free(x);
}
static void xfrm_state_gc_task(struct work_struct *work)
@@ -788,7 +794,7 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
{
spin_lock_bh(&net->xfrm.xfrm_state_lock);
si->sadcnt = net->xfrm.state_num;
- si->sadhcnt = net->xfrm.state_hmask;
+ si->sadhcnt = net->xfrm.state_hmask + 1;
si->sadhmcnt = xfrm_state_hashmax;
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c9a84e22f5d5..277c1c46fe94 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2288,13 +2288,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
}
- kfree(x);
+ xfrm_state_free(x);
kfree(xp);
return 0;
free_state:
- kfree(x);
+ xfrm_state_free(x);
nomem:
return err;
}