diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 05:02:57 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 05:02:57 +0100 |
commit | 386403a115f95997c2715691226e11a7b5cffcfd (patch) | |
tree | a685df70bd3d5b295683713818ddf0752c3d75b6 /net | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6 (diff) | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next (diff) | |
download | linux-386403a115f95997c2715691226e11a7b5cffcfd.tar.xz linux-386403a115f95997c2715691226e11a7b5cffcfd.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
"Another merge window, another pull full of stuff:
1) Support alternative names for network devices, from Jiri Pirko.
2) Introduce per-netns netdev notifiers, also from Jiri Pirko.
3) Support MSG_PEEK in vsock/virtio, from Matias Ezequiel Vara
Larsen.
4) Allow compiling out the TLS TOE code, from Jakub Kicinski.
5) Add several new tracepoints to the kTLS code, also from Jakub.
6) Support set channels ethtool callback in ena driver, from Sameeh
Jubran.
7) New SCTP events SCTP_ADDR_ADDED, SCTP_ADDR_REMOVED,
SCTP_ADDR_MADE_PRIM, and SCTP_SEND_FAILED_EVENT. From Xin Long.
8) Add XDP support to mvneta driver, from Lorenzo Bianconi.
9) Lots of netfilter hw offload fixes, cleanups and enhancements,
from Pablo Neira Ayuso.
10) PTP support for aquantia chips, from Egor Pomozov.
11) Add UDP segmentation offload support to igb, ixgbe, and i40e. From
Josh Hunt.
12) Add smart nagle to tipc, from Jon Maloy.
13) Support L2 field rewrite by TC offloads in bnxt_en, from Venkat
Duvvuru.
14) Add a flow mask cache to OVS, from Tonghao Zhang.
15) Add XDP support to ice driver, from Maciej Fijalkowski.
16) Add AF_XDP support to ice driver, from Krzysztof Kazimierczak.
17) Support UDP GSO offload in atlantic driver, from Igor Russkikh.
18) Support it in stmmac driver too, from Jose Abreu.
19) Support TIPC encryption and auth, from Tuong Lien.
20) Introduce BPF trampolines, from Alexei Starovoitov.
21) Make page_pool API more numa friendly, from Saeed Mahameed.
22) Introduce route hints to ipv4 and ipv6, from Paolo Abeni.
23) Add UDP segmentation offload to cxgb4, Rahul Lakkireddy"
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1857 commits)
libbpf: Fix usage of u32 in userspace code
mm: Implement no-MMU variant of vmalloc_user_node_flags
slip: Fix use-after-free Read in slip_open
net: dsa: sja1105: fix sja1105_parse_rgmii_delays()
macvlan: schedule bc_work even if error
enetc: add support Credit Based Shaper(CBS) for hardware offload
net: phy: add helpers phy_(un)lock_mdio_bus
mdio_bus: don't use managed reset-controller
ax88179_178a: add ethtool_op_get_ts_info()
mlxsw: spectrum_router: Fix use of uninitialized adjacency index
mlxsw: spectrum_router: After underlay moves, demote conflicting tunnels
bpf: Simplify __bpf_arch_text_poke poke type handling
bpf: Introduce BPF_TRACE_x helper for the tracing tests
bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT
bpf, testing: Add various tail call test cases
bpf, x86: Emit patchable direct jump as tail call
bpf: Constant map key tracking for prog array pokes
bpf: Add poke dependency tracking for prog array maps
bpf: Add initial poke descriptor table for jit images
bpf: Move owner type, jited info into array auxiliary data
...
Diffstat (limited to 'net')
297 files changed, 13224 insertions, 3695 deletions
diff --git a/net/Kconfig b/net/Kconfig index 3101bfcbdd7a..bd191f978a23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -258,7 +258,7 @@ config XPS default y config HWBM - bool + bool config CGROUP_NET_PRIO bool "Network priority cgroup" @@ -309,12 +309,12 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG ---help--- - Enabling this allows a stream parser to be used with - BPF_MAP_TYPE_SOCKMAP. + Enabling this allows a stream parser to be used with + BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. + BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. + It can be used to enforce socket policy, implement socket redirects, + etc. config NET_FLOW_LIMIT bool @@ -349,12 +349,12 @@ config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS ---help--- - This feature provides an alerting service to userspace in the - event that packets are discarded in the network stack. Alerts - are broadcast via netlink socket to any listening user space - process. If you don't need network drop alerts, or if you are ok - just checking the various proc files and other utilities for - drop statistics, say N here. + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. endmenu @@ -433,7 +433,7 @@ config NET_DEVLINK imply NET_DROP_MONITOR config PAGE_POOL - bool + bool config FAILOVER tristate "Generic failover module" diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 6c11cdf4dd4c..fbd0c5e7b299 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) dev_kfree_skb(skb); goto as_indicate_complete; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk)); sk->sk_state_change(sk); diff --git a/net/atm/svc.c b/net/atm/svc.c index 908cbb8654f5..ba144d035e3d 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -381,7 +381,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index bb222b882b67..324306d6fde0 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1384,7 +1384,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; out: diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index dcdbaeeb2358..cd6afe895db9 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -356,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, make->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); bh_unlock_sock(sk); } else { if (!mine) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 64054edc2e3c..4ff6cf1ecae7 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1085,7 +1085,6 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); - spin_lock_init(&hard_iface->bat_v.aggr_list_lock); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8033f24f506c..714ce56cfcc8 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -152,7 +152,7 @@ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ @@ -163,7 +163,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } @@ -174,17 +174,13 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * * Empties the OGMv2 aggregation queue and frees all the skbs it contained. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { - struct sk_buff *skb; - - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); - - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) - kfree_skb(skb); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); + __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } @@ -197,7 +193,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) * * The aggregation queue is empty after this call. * - * Caller needs to hold the hard_iface->bat_v.aggr_list_lock. + * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { @@ -206,7 +202,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) unsigned int ogm_len; struct sk_buff *skb; - lockdep_assert_held(&hard_iface->bat_v.aggr_list_lock); + lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; @@ -220,7 +216,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); - while ((skb = skb_dequeue(&hard_iface->bat_v.aggr_list))) { + while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); @@ -247,13 +243,13 @@ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, return; } - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); - skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** @@ -392,9 +388,9 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } @@ -425,9 +421,9 @@ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); - spin_lock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); - spin_unlock_bh(&hard_iface->bat_v.aggr_list_lock); + spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 6967f2e4c3f4..c7b340ddd0e7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.4" +#define BATADV_SOURCE_VERSION "2019.5" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d5bdf3a4b65..f9ec8e7507b6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1421,7 +1421,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (*orig) return BATADV_FORW_SINGLE; - /* fall through */ + fallthrough; case 0: return BATADV_FORW_NONE; default: diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5ee8e9a100f9..832e156c519e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> @@ -230,7 +229,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, break; } - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } @@ -455,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface, if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; - /* fall through */ + fallthrough; case ETH_P_BATMAN: goto dropped; } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4d7f1baee7b7..47718a82eaf2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -130,9 +130,6 @@ struct batadv_hard_iface_bat_v { /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */ unsigned int aggr_len; - /** @aggr_list_lock: protects aggr_list */ - spinlock_t aggr_list_lock; - /** * @throughput_override: throughput override to disable link * auto-detection diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 5f508c50649d..3fd124927d4d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -173,7 +173,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else release_sock(sk); - parent->sk_ack_backlog++; + sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -185,7 +185,7 @@ void bt_accept_unlink(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); - bt_sk(sk)->parent->sk_ack_backlog--; + sk_acceptq_removed(bt_sk(sk)->parent); bt_sk(sk)->parent = NULL; sock_put(sk); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ad5b0ac1f9ce..87691404d0c6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -934,6 +934,14 @@ static void hci_req_directed_advertising(struct hci_request *req, return; memset(&cp, 0, sizeof(cp)); + + /* Some controllers might reject command if intervals are not + * within range for undirected advertising. + * BCM20702A0 is known to be affected by this. + */ + cp.min_interval = cpu_to_le16(0x0020); + cp.max_interval = cpu_to_le16(0x0020); + cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; @@ -1168,8 +1176,10 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, if (!conn) return ERR_PTR(-ENOMEM); - if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) + if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { + hci_conn_del(conn); return ERR_PTR(-EBUSY); + } conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 04bc79359a17..9e19d5a3aac8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -842,8 +842,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { struct hci_cp_le_write_def_data_len cp; - cp.tx_len = hdev->le_max_tx_len; - cp.tx_time = hdev->le_max_tx_time; + cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); + cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); } @@ -1444,11 +1444,20 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_SETUP) || test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { + bool invalid_bdaddr; + hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); + /* The transport driver can set the quirk to mark the + * BD_ADDR invalid before creating the HCI device or in + * its setup callback. + */ + invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, + &hdev->quirks); + if (ret) goto setup_failed; @@ -1457,20 +1466,33 @@ static int hci_dev_do_open(struct hci_dev *hdev) hci_dev_get_bd_addr_from_property(hdev); if (bacmp(&hdev->public_addr, BDADDR_ANY) && - hdev->set_bdaddr) + hdev->set_bdaddr) { ret = hdev->set_bdaddr(hdev, &hdev->public_addr); + + /* If setting of the BD_ADDR from the device + * property succeeds, then treat the address + * as valid even if the invalid BD_ADDR + * quirk indicates otherwise. + */ + if (!ret) + invalid_bdaddr = false; + } } setup_failed: /* The transport driver can set these quirks before * creating the HCI device or in its setup callback. * + * For the invalid BD_ADDR quirk it is possible that + * it becomes a valid address if the bootloader does + * provide it (see above). + * * In case any of them is set, the controller has to * start up as unconfigured. */ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || - test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks)) + invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to @@ -4440,7 +4462,14 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + /* If the device has been opened in HCI_USER_CHANNEL, + * the userspace has exclusive access to device. + * When device is HCI_INIT, we still need to process + * the data packets to the driver in order + * to complete its setup(). + */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 7f6a581b5b7e..2a1b64dbf76e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -904,9 +904,9 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -923,9 +923,9 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) u8 instance = hdev->cur_adv_instance; struct adv_info *adv_instance; - /* Ignore instance 0 */ + /* Instance 0x00 always set local name */ if (instance == 0x00) - return 0; + return 1; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) @@ -1273,6 +1273,14 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) instance_flags = get_adv_instance_flags(hdev, instance); + /* If instance already has the flags set skip adding it once + * again. + */ + if (adv_instance && eir_get_data(adv_instance->adv_data, + adv_instance->adv_data_len, EIR_FLAGS, + NULL)) + goto skip_flags; + /* The Add Advertising command allows userspace to set both the general * and limited discoverable flags. */ @@ -1305,6 +1313,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) } } +skip_flags: if (adv_instance) { memcpy(ptr, adv_instance->adv_data, adv_instance->adv_data_len); @@ -1690,7 +1699,7 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) * scheduling it. */ if (adv_instance && adv_instance->duration) { - u16 duration = adv_instance->duration * MSEC_PER_SEC; + u16 duration = adv_instance->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ adv_set->duration = cpu_to_le16(duration / 10); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index da7fdbdf9c41..a845786258a0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4936,10 +4936,8 @@ void __l2cap_physical_cfm(struct l2cap_chan *chan, int result) BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d", chan, result, local_amp_id, remote_amp_id); - if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) { - l2cap_chan_unlock(chan); + if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) return; - } if (chan->state != BT_CONNECTED) { l2cap_do_create(chan, result, local_amp_id, remote_amp_id); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 26e8cfad22b8..6b42be4b5861 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -502,15 +502,12 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; u8 hash[3]; int err; if (!chan || !chan->data) return false; - smp = chan->data; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); @@ -523,14 +520,11 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; - struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; - smp = chan->data; - get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1153bbcdff72..915c2d6f7fb9 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -105,6 +105,40 @@ out: return err; } +/* Integer types of various sizes and pointer combinations cover variety of + * architecture dependent calling conventions. 7+ can be supported in the + * future. + */ +int noinline bpf_fentry_test1(int a) +{ + return a + 1; +} + +int noinline bpf_fentry_test2(int a, u64 b) +{ + return a + b; +} + +int noinline bpf_fentry_test3(char a, int b, u64 c) +{ + return a + b + c; +} + +int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +{ + return (long)a + b + c + d; +} + +int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +{ + return a + (long)b + c + d + e; +} + +int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +{ + return a + (long)b + c + d + (long)e + f; +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { @@ -122,6 +156,15 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, kfree(data); return ERR_PTR(-EFAULT); } + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) { + kfree(data); + return ERR_PTR(-EFAULT); + } return data; } @@ -218,10 +261,18 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + FIELD_SIZEOF(struct __sk_buff, cb), + offsetof(struct __sk_buff, tstamp))) + return -EINVAL; + + /* tstamp is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + + FIELD_SIZEOF(struct __sk_buff, tstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->priority = __skb->priority; + skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); return 0; @@ -235,6 +286,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) return; __skb->priority = skb->priority; + __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e804a3016902..434effde02c3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -263,6 +263,37 @@ static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) strlcpy(info->bus_info, "N/A", sizeof(info->bus_info)); } +static int br_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p; + + cmd->base.duplex = DUPLEX_UNKNOWN; + cmd->base.port = PORT_OTHER; + cmd->base.speed = SPEED_UNKNOWN; + + list_for_each_entry(p, &br->port_list, list) { + struct ethtool_link_ksettings ecmd; + struct net_device *pdev = p->dev; + + if (!netif_running(pdev) || !netif_oper_up(pdev)) + continue; + + if (__ethtool_get_link_ksettings(pdev, &ecmd)) + continue; + + if (ecmd.base.speed == (__u32)SPEED_UNKNOWN) + continue; + + if (cmd->base.speed == (__u32)SPEED_UNKNOWN || + cmd->base.speed < ecmd.base.speed) + cmd->base.speed = ecmd.base.speed; + } + + return 0; +} + static netdev_features_t br_fix_features(struct net_device *dev, netdev_features_t features) { @@ -365,8 +396,9 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) } static const struct ethtool_ops br_ethtool_ops = { - .get_drvinfo = br_getinfo, - .get_link = ethtool_op_get_link, + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_link_ksettings = br_get_link_ksettings, }; static const struct net_device_ops br_netdev_ops = { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b1d3248c0252..4877a0db16c6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -75,8 +75,9 @@ static inline unsigned long hold_time(const struct net_bridge *br) static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - return !fdb->is_static && !fdb->added_by_external_learn && - time_before_eq(fdb->updated + hold_time(br), jiffies); + return !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && + time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) @@ -197,7 +198,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, { trace_fdb_delete(br, f); - if (f->is_static) + if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); @@ -224,7 +225,7 @@ static void fdb_delete_local(struct net_bridge *br, if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } @@ -235,7 +236,7 @@ static void fdb_delete_local(struct net_bridge *br, if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; - f->added_by_user = 0; + clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -250,7 +251,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); - if (f && f->is_local && !f->added_by_user && f->dst == p) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } @@ -265,7 +267,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { - if (f->dst == p && f->is_local && !f->added_by_user) { + if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && + !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); @@ -306,7 +309,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -321,7 +325,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst && !f->added_by_user) + if (f && test_bit(BR_FDB_LOCAL, &f->flags) && + !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -346,7 +351,8 @@ void br_fdb_cleanup(struct work_struct *work) hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer; - if (f->is_static || f->added_by_external_learn) + if (test_bit(BR_FDB_STATIC, &f->flags) || + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) continue; this_timer = f->updated + delay; if (time_after(this_timer, now)) { @@ -373,7 +379,7 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); @@ -397,10 +403,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, continue; if (!do_all) - if (f->is_static || (vid && f->key.vlan_id != vid)) + if (test_bit(BR_FDB_STATIC, &f->flags) || + (vid && f->key.vlan_id != vid)) continue; - if (f->is_local) + if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); @@ -469,8 +476,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) + fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); + if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; @@ -484,8 +491,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, - unsigned char is_local, - unsigned char is_static) + unsigned long flags) { struct net_bridge_fdb_entry *fdb; @@ -494,12 +500,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; fdb->key.vlan_id = vid; - fdb->is_local = is_local; - fdb->is_static = is_static; - fdb->added_by_user = 0; - fdb->added_by_external_learn = 0; - fdb->offloaded = 0; - fdb->is_sticky = 0; + fdb->flags = flags; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, @@ -526,14 +527,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, /* it is okay to have multiple ports with same * address, just use the first one. */ - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } - fdb = fdb_create(br, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, + BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; @@ -555,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user) + const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -564,15 +566,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, if (hold_time(br) == 0) return; - /* ignore packets unless we are using this port */ - if (!(source->state == BR_STATE_LEARNING || - source->state == BR_STATE_FORWARDING)) - return; - fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ - if (unlikely(fdb->is_local)) { + if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); @@ -580,30 +577,30 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && !fdb->is_sticky)) { + if (unlikely(source != fdb->dst && + !test_bit(BR_FDB_STICKY, &fdb->flags))) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ - if (unlikely(fdb->added_by_external_learn)) - fdb->added_by_external_learn = 0; + if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags))) + clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, + &fdb->flags); } if (now != fdb->updated) fdb->updated = now; - if (unlikely(added_by_user)) - fdb->added_by_user = 1; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { - trace_br_fdb_update(br, source, addr, vid, added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, - added_by_user); + trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts @@ -616,9 +613,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - if (fdb->is_local) + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; - else if (fdb->is_static) + else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; @@ -648,11 +645,11 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); - if (fdb->offloaded) + if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; - if (fdb->added_by_external_learn) + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (fdb->is_sticky) + if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) @@ -799,7 +796,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, u8 ndm_flags) { - u8 is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -823,7 +820,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(br, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; @@ -840,34 +837,28 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { - fdb->is_local = 1; - if (!fdb->is_static) { - fdb->is_static = 1; + set_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else if (state & NUD_NOARP) { - fdb->is_local = 0; - if (!fdb->is_static) { - fdb->is_static = 1; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); - } } else { - fdb->is_local = 0; - if (fdb->is_static) { - fdb->is_static = 0; + clear_bit(BR_FDB_LOCAL, &fdb->flags); + if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); - } } modified = true; } - if (is_sticky != fdb->is_sticky) { - fdb->is_sticky = is_sticky; + if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { + change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { @@ -890,9 +881,12 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, br->dev->name); return -EINVAL; } + if (!nbp_state_should_learn(p)) + return 0; + local_bh_disable(); rcu_read_lock(); - br_fdb_update(br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { @@ -1064,7 +1058,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) @@ -1078,7 +1072,7 @@ done: rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!tmp->is_static) + if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; @@ -1097,7 +1091,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ - if (!f->is_static) + if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); @@ -1119,14 +1113,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(br, p, addr, vid, 0, 0); + unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); + + if (swdev_notify) + flags |= BIT(BR_FDB_ADDED_BY_USER); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } - if (swdev_notify) - fdb->added_by_user = 1; - fdb->added_by_external_learn = 1; fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; @@ -1136,17 +1131,17 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (fdb->added_by_external_learn) { + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!fdb->added_by_user) { + } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ - fdb->added_by_external_learn = 1; + set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (swdev_notify) - fdb->added_by_user = 1; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); @@ -1168,7 +1163,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb && fdb->added_by_external_learn) + if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; @@ -1186,8 +1181,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); - if (fdb) - fdb->offloaded = offloaded; + if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) + change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } @@ -1206,7 +1201,7 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid) spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) - f->offloaded = 0; + clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 09b1dd8cd853..8944ceb47fe9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -88,7 +88,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -151,7 +151,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { unsigned long now = jiffies; - if (dst->is_local) + if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) @@ -182,9 +182,10 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && + nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ce2ab14ee605..36b0367ca1e0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -172,6 +172,16 @@ struct net_bridge_vlan_group { u16 pvid; }; +/* bridge fdb flags */ +enum { + BR_FDB_LOCAL, + BR_FDB_STATIC, + BR_FDB_STICKY, + BR_FDB_ADDED_BY_USER, + BR_FDB_ADDED_BY_EXT_LEARN, + BR_FDB_OFFLOADED, +}; + struct net_bridge_fdb_key { mac_addr addr; u16 vlan_id; @@ -183,12 +193,7 @@ struct net_bridge_fdb_entry { struct net_bridge_fdb_key key; struct hlist_node fdb_node; - unsigned char is_local:1, - is_static:1, - is_sticky:1, - added_by_user:1, - added_by_external_learn:1, - offloaded:1; + unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; @@ -495,6 +500,11 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) return true; } +static inline bool nbp_state_should_learn(const struct net_bridge_port *p) +{ + return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { @@ -566,7 +576,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user); + const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 921310d3cbae..015209bf44aa 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -129,15 +129,19 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user, - fdb->offloaded); + test_bit(BR_FDB_ADDED_BY_USER, + &fdb->flags), + test_bit(BR_FDB_OFFLOADED, + &fdb->flags)); break; } } diff --git a/net/caif/Kconfig b/net/caif/Kconfig index eb83051c8330..b7532a79ca7a 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -13,11 +13,11 @@ menuconfig CAIF with its modems. It is accessed from user space as sockets (PF_CAIF). Say Y (or M) here if you build for a phone product (e.g. Android or - MeeGo ) that uses CAIF as transport, if unsure say N. + MeeGo) that uses CAIF as transport. If unsure say N. If you select to build it as module then CAIF_NETDEV also needs to be - built as modules. You will also need to say yes to any CAIF physical - devices that your platform requires. + built as a module. You will also need to say Y (or M) to any CAIF + physical devices that your platform requires. See Documentation/networking/caif for a further explanation on how to use and configure CAIF. @@ -37,7 +37,7 @@ config CAIF_NETDEV default CAIF ---help--- Say Y if you will be using a CAIF based GPRS network device. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say Y. @@ -48,7 +48,7 @@ config CAIF_USB default n ---help--- Say Y if you are using CAIF over USB CDC NCM. - This can be either built-in or a loadable module, + This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say N. diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index da5639a5bd3b..458be6b3eda9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -798,7 +798,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ - map = bpf_map_inc_not_zero(&smap->map, false); + map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; diff --git a/net/core/dev.c b/net/core/dev.c index 99ac84ff398f..c7fc902ccbdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -229,6 +229,122 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -238,7 +354,7 @@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -256,7 +372,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -652,14 +768,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -677,14 +789,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -1060,8 +1168,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1077,7 +1185,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device @@ -1151,13 +1258,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1536,6 +1643,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1554,8 +1717,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1568,17 +1729,9 @@ int register_netdevice_notifier(struct notifier_block *nb) if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1587,22 +1740,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } - -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1653,6 +1793,80 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto chain_unregister; + +unlock: + rtnl_unlock(); + return err; + +chain_unregister: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + goto unlock; + + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +/** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data @@ -1664,7 +1878,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -2690,7 +2915,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); @@ -2858,12 +3083,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -2898,12 +3120,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); @@ -3386,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && qdisc_run_begin(q)) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { @@ -5365,7 +5586,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5390,8 +5611,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && + if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, @@ -5582,6 +5802,26 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -5589,12 +5829,13 @@ static void napi_skb_free_stolen_head(struct sk_buff *skb) kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5626,7 +5867,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5672,26 +5913,6 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) @@ -8532,6 +8753,9 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -9011,6 +9235,11 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -9132,6 +9361,8 @@ out: return ret; err_uninit: + if (dev->name_node) + netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) @@ -9946,6 +10177,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: diff --git a/net/core/devlink.c b/net/core/devlink.c index 93905dc7c179..4c63c9a4c09e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -95,16 +95,25 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -static struct net *devlink_net(const struct devlink *devlink) +struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } +EXPORT_SYMBOL_GPL(devlink_net); -static void devlink_net_set(struct devlink *devlink, struct net *net) +static void __devlink_net_set(struct devlink *devlink, struct net *net) { write_pnet(&devlink->_net, net); } +void devlink_net_set(struct devlink *devlink, struct net *net) +{ + if (WARN_ON(devlink->registered)) + return; + __devlink_net_set(devlink, net); +} +EXPORT_SYMBOL_GPL(devlink_net_set); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -434,8 +443,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - devlink = devlink_get_from_info(info); - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + /* When devlink changes netns, it would not be found + * by devlink_get_from_info(). So try if it is stored first. + */ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + devlink = info->user_ptr[0]; + } else { + devlink = devlink_get_from_info(info); + WARN_ON(IS_ERR(devlink)); + } + if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -1035,7 +1052,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1058,6 +1075,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1233,7 +1253,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1256,6 +1276,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1460,7 +1483,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1485,6 +1508,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -2674,6 +2700,72 @@ devlink_resources_validate(struct devlink *devlink, return err; } +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd); + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *dest_net) +{ + struct devlink_param_item *param_item; + + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); + + __devlink_net_set(devlink, dest_net); + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); +} + static bool devlink_reload_supported(struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; @@ -2694,9 +2786,30 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static int devlink_reload(struct devlink *devlink, struct net *dest_net, + struct netlink_ext_ack *extack) +{ + int err; + + if (!devlink->reload_enabled) + return -EOPNOTSUPP; + + err = devlink->ops->reload_down(devlink, !!dest_net, extack); + if (err) + return err; + + if (dest_net && !net_eq(dest_net, devlink_net(devlink))) + devlink_reload_netns_change(devlink, dest_net); + + err = devlink->ops->reload_up(devlink, extack); + devlink_reload_failed_set(devlink, !!err); + return err; +} + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct net *dest_net = NULL; int err; if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) @@ -2707,11 +2820,20 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - err = devlink->ops->reload_down(devlink, info->extack); - if (err) - return err; - err = devlink->ops->reload_up(devlink, info->extack); - devlink_reload_failed_set(devlink, !!err); + + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, info->extack); + + if (dest_net) + put_net(dest_net); + return err; } @@ -2884,6 +3006,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -3155,7 +3282,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3183,6 +3310,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3411,7 +3541,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3444,6 +3574,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3818,29 +3951,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = 0; + struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto out_free; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) { @@ -3917,7 +4040,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); - kfree(attrs); return skb->len; @@ -3927,8 +4049,6 @@ out_unlock: mutex_unlock(&devlink->lock); out_dev: mutex_unlock(&devlink_mutex); -out_free: - kfree(attrs); return err; } @@ -4066,7 +4186,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -4094,6 +4214,9 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -4296,12 +4419,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) +static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) @@ -4409,19 +4531,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) + const void *value, u32 value_len) { + u32 data_size; + u32 offset; int err; - err = devlink_fmsg_pair_nest_start(fmsg, name); + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); if (err) return err; - err = devlink_fmsg_binary_put(fmsg, value, value_len); - if (err) - return err; + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > DEVLINK_FMSG_MAX_SIZE) + data_size = DEVLINK_FMSG_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + return err; + } - err = devlink_fmsg_pair_nest_end(fmsg); + err = devlink_fmsg_arr_pair_nest_end(fmsg); if (err) return err; @@ -4733,14 +4862,17 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, struct netlink_ext_ack *extack) { int err; + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + if (!reporter->ops->recover) return -EOPNOTSUPP; - err = reporter->ops->recover(reporter, priv_ctx); + err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; @@ -4761,7 +4893,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter) } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { int err; @@ -4782,7 +4915,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx); + priv_ctx, extack); if (err) goto dump_err; @@ -4830,11 +4963,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); + devlink_health_do_dump(reporter, priv_ctx, NULL); mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, + priv_ctx, NULL); return 0; } @@ -4869,21 +5003,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; struct devlink *devlink; - struct nlattr **attrs; - int err; - - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return NULL; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto free; mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); @@ -4892,12 +5015,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); mutex_unlock(&devlink_mutex); - kfree(attrs); return reporter; unlock: mutex_unlock(&devlink_mutex); -free: - kfree(attrs); return NULL; } @@ -5090,7 +5210,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL, info->extack); devlink_health_reporter_put(reporter); return err; @@ -5123,7 +5243,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (err) goto out; - err = reporter->ops->diagnose(reporter, fmsg); + err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; @@ -5158,7 +5278,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, } mutex_lock(&reporter->dump_lock); if (!start) { - err = devlink_health_do_dump(reporter, NULL); + err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; cb->args[1] = reporter->dump_ts; @@ -5799,6 +5919,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6029,7 +6152,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, @@ -6077,7 +6201,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | @@ -6161,7 +6286,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; - devlink_net_set(devlink, &init_net); + __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); @@ -6187,6 +6312,7 @@ int devlink_register(struct devlink *devlink, struct device *dev) { mutex_lock(&devlink_mutex); devlink->dev = dev; + devlink->registered = true; list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -7533,6 +7659,21 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8103,9 +8244,43 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (net_eq(devlink_net(devlink), net)) { + if (WARN_ON(!devlink_reload_supported(devlink))) + continue; + err = devlink_reload(devlink, &init_net, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } + } + mutex_unlock(&devlink_mutex); +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + static int __init devlink_init(void) { - return genl_register_family(&devlink_nl_family); + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; } subsys_initcall(devlink_init); diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 470a606d5e8d..fc96259807b6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -12,17 +12,15 @@ static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; }; -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; - info->net = net; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } @@ -31,106 +29,100 @@ EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; - info->net = net; - err = atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -static unsigned int fib_seq_sum(void) +static unsigned int fib_seq_sum(struct net *net) { - struct fib_notifier_net *fn_net; + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - struct net *net; rtnl_lock(); - down_read(&net_rwsem); - for_each_net(net) { - fn_net = net_generic(net, fib_notifier_net_id); - rcu_read_lock(); - list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - if (!try_module_get(ops->owner)) - continue; - fib_seq += ops->fib_seq_read(net); - module_put(ops->owner); - } - rcu_read_unlock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); } - up_read(&net_rwsem); + rcu_read_unlock(); rtnl_unlock(); return fib_seq; } -static int fib_net_dump(struct net *net, struct notifier_block *nb) +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; + int err = 0; + rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - int err; - if (!try_module_get(ops->owner)) continue; - err = ops->fib_dump(net, nb); + err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) - return err; + goto unlock; } - return 0; +unlock: + rcu_read_unlock(); + + return err; } -static bool fib_dump_is_consistent(struct notifier_block *nb, +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) return true; - atomic_notifier_chain_unregister(&fib_chain, nb); + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) { int retries = 0; int err; do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - rcu_read_lock(); - for_each_net_rcu(net) { - err = fib_net_dump(net, nb); - if (err) - goto err_fib_net_dump; - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; - -err_fib_net_dump: - rcu_read_unlock(); - return err; } EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); @@ -181,6 +173,7 @@ static int __net_init fib_notifier_net_init(struct net *net) struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index dd220ce7ca7a..3e7e15278c46 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -321,16 +321,18 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib_rule *rule, int family) + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, + .info.extack = extack, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -350,20 +352,25 @@ static int call_fib_rule_notifiers(struct net *net, } /* Called with rcu_read_lock() */ -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; + int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, - family); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } rules_ops_put(ops); - return 0; + return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); diff --git a/net/core/filter.c b/net/core/filter.c index 9ad29f576f1a..b0ed048585ba 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2245,7 +2245,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg.copy[i] && bytes_sg_total <= len) + if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2450,7 +2450,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - msg->sg.copy[new] = false; + __clear_bit(new, &msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -8671,16 +8684,6 @@ out: } #ifdef CONFIG_INET -struct sk_reuseport_kern { - struct sk_buff *skb; - struct sock *sk; - struct sock *selected_sk; - void *data_end; - u32 hash; - u32 reuseport_id; - bool bind_inany; -}; - static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 68eda10d0680..ca871657a4c4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,19 +114,50 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, { struct bpf_prog *attached; struct net *net; + int ret = 0; net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); + if (attached) + bpf_prog_put(attached); +out: mutex_unlock(&flow_dissector_mutex); - return 0; + return ret; } int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) @@ -147,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -203,6 +213,72 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @toff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -853,7 +929,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -910,7 +985,10 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(init_net.flow_dissector_prog); + + if (!attached) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { struct bpf_flow_keys flow_keys; @@ -1295,6 +1373,12 @@ ip_proto_again: data, nhoff, hlen); break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + default: break; } @@ -1308,14 +1392,6 @@ ip_proto_again: data, hlen); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } - /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: @@ -1365,8 +1441,8 @@ static const void *flow_keys_hash_start(const struct flow_keys *flow) static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); - BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != - sizeof(*flow) - sizeof(flow->addrs)); + + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: @@ -1412,6 +1488,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index bfe7bdd4c340..80dbf2f4016e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -48,7 +48,7 @@ struct net_rate_estimator { u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; - u32 last_packets; + u64 last_packets; u64 last_bytes; u64 avpps; @@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t) brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); + rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 36888f5e09eb..1d653fbfcf52 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - u64 bytes; - u32 packets; + u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); @@ -176,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running, if (d->tail) { struct gnet_stats_basic sb; + int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, type, &sb, sizeof(sb), - TCA_STATS_PAD); + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats.packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, + sizeof(bstats.packets), TCA_STATS_PAD); } return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5480edff0c86..652da6369037 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1197,7 +1197,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1476,7 +1476,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { @@ -2052,8 +2052,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 36347933ec3a..6bbd06f7dc7d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff struct hlist_head *h; unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { if (++count == offset) return dev; } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5bc65587f1c4..a6aefe989043 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -18,6 +18,9 @@ #include <trace/events/page_pool.h> +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -112,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, @@ -156,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } page->dma_addr = dma; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -193,22 +224,14 @@ static s32 page_pool_inflight(struct page_pool *pool) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; + s32 inflight; - distance = _distance(hold_cnt, release_cnt); + inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); - return distance; -} - -static bool __page_pool_safe_to_destroy(struct page_pool *pool) -{ - s32 inflight = page_pool_inflight(pool); - - /* The distance should not be able to become negative */ + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); - return (inflight == 0); + return inflight; } /* Cleanup page_pool state from page */ @@ -216,6 +239,7 @@ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; + int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) goto skip_dma_unmap; @@ -227,9 +251,11 @@ static void __page_pool_clean_page(struct page_pool *pool, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; skip_dma_unmap: - atomic_inc(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, - atomic_read(&pool->pages_state_release_cnt)); + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); } /* unmap the page and clean our state */ @@ -283,8 +309,19 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* page is NOT reusable when: + * 1) allocated when system is under some pressure. (page_is_pfmemalloc) + * 2) belongs to a different NUMA node than pool->p.nid. + * + * To update pool->p.nid users must call page_pool_update_nid. + */ +static bool pool_page_reusable(struct page_pool *pool, struct page *page) +{ + return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; +} + +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -292,9 +329,14 @@ void __page_pool_put_page(struct page_pool *pool, * * refcnt == 1 means page_pool owns page, and can recycle it. */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && + pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + if (allow_direct && in_serving_softirq()) if (__page_pool_recycle_direct(page, pool)) return; @@ -338,31 +380,10 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __warn_in_flight(struct page_pool *pool) +static void page_pool_free(struct page_pool *pool) { - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - /* Drivers should fix this, but only problematic when DMA is used */ - WARN(1, "Still in-flight pages:%d hold:%u released:%u", - distance, hold_cnt, release_cnt); -} - -void __page_pool_free(struct page_pool *pool) -{ - /* Only last user actually free/release resources */ - if (!page_pool_put(pool)) - return; - - WARN(pool->alloc.count, "API usage violation"); - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); - - /* Can happen due to forced shutdown */ - if (!__page_pool_safe_to_destroy(pool)) - __warn_in_flight(pool); + if (pool->disconnect) + pool->disconnect(pool); ptr_ring_cleanup(&pool->ring, NULL); @@ -371,15 +392,14 @@ void __page_pool_free(struct page_pool *pool) kfree(pool); } -EXPORT_SYMBOL(__page_pool_free); -/* Request to shutdown: release pages cached by page_pool, and check - * for in-flight pages - */ -bool __page_pool_request_shutdown(struct page_pool *pool) +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { struct page *page; + if (pool->destroy_cnt) + return; + /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. @@ -388,12 +408,83 @@ bool __page_pool_request_shutdown(struct page_pool *pool) page = pool->alloc.cache[--pool->alloc.count]; __page_pool_return_page(pool, page); } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + page_pool_empty_alloc_cache_once(pool); + pool->destroy_cnt++; /* No more consumers should exist, but producers could still * be in-flight. */ __page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool); + if (!inflight) + page_pool_free(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + int inflight; + + inflight = page_pool_release(pool); + if (!inflight) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, pool->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", + __func__, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} - return __page_pool_safe_to_destroy(pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; +} + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + if (!page_pool_release(pool)) + return; + + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} +EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; } -EXPORT_SYMBOL(__page_pool_request_shutdown); +EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 48b1e429857c..294bfcf0ce0e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3404,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) HARD_TX_LOCK(odev, txq, smp_processor_id()); if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { - ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e4ec575c1fba..9f7aa448bd11 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void) return xdp_size; } +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1027,6 +1040,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + 0; } @@ -1584,6 +1598,42 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1697,6 +1747,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure_rcu; rcu_read_unlock(); + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1750,6 +1803,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2744,6 +2800,26 @@ errout: return err; } +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *ifname_attr, + struct nlattr *altifname_attr, + char *ifname) +{ + char buffer[ALTIFNAMSIZ]; + + if (!ifname) { + ifname = buffer; + if (ifname_attr) + nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else if (altifname_attr) + nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + else + return NULL; + } + + return __dev_get_by_name(net, ifname); +} + static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2772,8 +2848,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else goto errout; @@ -2846,7 +2922,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; @@ -2860,9 +2935,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); @@ -2874,8 +2946,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -3046,12 +3119,10 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - } + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); + else + dev = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); @@ -3313,6 +3384,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, switch (i) { case IFLA_IFNAME: + case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; @@ -3331,7 +3403,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -3354,9 +3425,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -3364,8 +3432,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else goto out; @@ -3395,6 +3464,100 @@ out: return err; } +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + alt_ifname = nla_data(attr); + if (cmd == RTM_NEWLINKPROP) { + alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + err = netdev_name_node_alt_create(dev, alt_ifname); + if (err) { + kfree(alt_ifname); + return err; + } + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + if (err) + return err; + } else { + WARN_ON(1); + return 0; + } + + *changed = true; + return 0; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -5353,6 +5516,9 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ad31e4e53d0a..a469d2124f3f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -793,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk) static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; - void (*write_space)(struct sock *sk); + void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); - if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) - schedule_work(&psock->work); - write_space = psock->saved_write_space; + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } rcu_read_unlock(); - write_space(sk); + if (write_space) + write_space(sk); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/sock.c b/net/core/sock.c index ac78a570e43a..71787f7c4f8c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; - int size; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; @@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; - size = sizeof(old_tv); - } else { - *(struct __kernel_sock_timeval *)optval = tv; - size = sizeof(tv); + return sizeof(old_tv); } - return size; + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); } static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) @@ -687,7 +684,8 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -3015,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt);; + sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } @@ -3042,7 +3040,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, } EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; diff --git a/net/core/xdp.c b/net/core/xdp.c index d7bf62ffbb5e..e334fad0a6b8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -70,77 +70,63 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); - /* Allocator have indicated safe to remove before this is called */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - page_pool_free(xa->page_pool); - /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Poison memory */ - xa->mem.id = 0xFFFF; - xa->mem.type = 0xF0F0; - xa->allocator = (void *)0xDEAD9001; - kfree(xa); } -static bool __mem_id_disconnect(int id, bool force) +static void mem_xa_remove(struct xdp_mem_allocator *xa) { - struct xdp_mem_allocator *xa; - bool safe_to_remove = true; + trace_mem_disconnect(xa); mutex_lock(&mem_id_lock); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return true; - } - xa->disconnect_cnt++; - - /* Detects in-flight packet-pages for page_pool */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - safe_to_remove = page_pool_request_shutdown(xa->page_pool); - - trace_mem_disconnect(xa, safe_to_remove, force); - - if ((safe_to_remove || force) && - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); mutex_unlock(&mem_id_lock); - return (safe_to_remove|force); } -#define DEFER_TIME (msecs_to_jiffies(1000)) -#define DEFER_WARN_INTERVAL (30 * HZ) -#define DEFER_MAX_RETRIES 120 +static void mem_allocator_disconnect(void *allocator) +{ + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); + + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } + + rhashtable_walk_stop(&iter); -static void mem_id_disconnect_defer_retry(struct work_struct *wq) + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); +} + +static void mem_id_disconnect(int id) { - struct delayed_work *dwq = to_delayed_work(wq); - struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); - bool force = false; + struct xdp_mem_allocator *xa; - if (xa->disconnect_cnt > DEFER_MAX_RETRIES) - force = true; + mutex_lock(&mem_id_lock); - if (__mem_id_disconnect(xa->mem.id, force)) + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); return; + } - /* Periodic warning */ - if (time_after_eq(jiffies, xa->defer_warn)) { - int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + trace_mem_disconnect(xa); - pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", - __func__, xa->mem.id, xa->disconnect_cnt, sec); - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - } + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - /* Still not ready to be disconnected, retry later */ - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); + mutex_unlock(&mem_id_lock); } void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) @@ -153,38 +139,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) return; } - if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && - xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { - return; - } - if (id == 0) return; - if (__mem_id_disconnect(id, false)) - return; - - /* Could not disconnect, defer new disconnect attempt to later */ - mutex_lock(&mem_id_lock); + if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) + return mem_id_disconnect(id); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; + if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + rcu_read_unlock(); } - xa->defer_start = jiffies; - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - - INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); - mutex_unlock(&mem_id_lock); - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); -/* This unregister operation will also cleanup and destroy the - * allocator. The page_pool_free() operation is first called when it's - * safe to remove, possibly deferred to a workqueue. - */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -371,7 +340,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_get(xdp_alloc->page_pool); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); mutex_unlock(&mem_id_lock); @@ -386,7 +355,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame - * while still leveraging this protection. The @napi_direct boolian + * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ @@ -402,15 +371,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (likely(xa)) { - napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); - } else { - /* Hopefully stack show who to blame for late return */ - WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); - trace_mem_return_failed(mem, page); - put_page(page); - } + napi_direct &= !xdp_return_frame_no_direct(); + page_pool_put_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5bad08dc4316..a52e8ba1ced0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3349ea81f901..e19a92a62e14 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1091,7 +1091,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } cb = DN_SKB_CB(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); if (newsk == NULL) { release_sock(sk); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index e4161e0c86aa..c68503a18025 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -328,7 +328,7 @@ static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) return; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 29e2bd5cc5af..1e6c3cac11e6 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -20,7 +20,7 @@ if NET_DSA # tagging formats config NET_DSA_TAG_8021Q - tristate "Tag driver for switches using custom 802.1Q VLAN headers" + tristate select VLAN_8021Q help Unlike the other tagging protocols, the 802.1Q config option simply @@ -79,6 +79,13 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. +config NET_DSA_TAG_OCELOT + tristate "Tag driver for Ocelot family of switches" + select PACKING + help + Say Y or M if you want to enable support for tagging frames for the + Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959). + config NET_DSA_TAG_QCA tristate "Tag driver for Qualcomm Atheros QCA8K switches" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 2c6d286f0511..9a482c38bdb1 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 43120a3fb06f..17281fec710c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return dsa_is_user_port(ds, p) && ds->ports[p].slave; + const struct dsa_port *dp = dsa_to_port(ds, p); + + return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].slave); + ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].slave); + ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); if (ret) return ret; } @@ -329,6 +331,91 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_dsa_notifiers); +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_get(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_get); + +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_devlink_priv *dl_priv; + struct dsa_switch *ds; + + dl_priv = devlink_priv(dl); + ds = dl_priv->ds; + + if (!ds->ops->devlink_param_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_set(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_set); + +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + return devlink_params_register(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_register); + +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + devlink_params_unregister(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); + +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + return devlink_resource_register(ds->devlink, resource_name, + resource_size, resource_id, + parent_resource_id, + size_params); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds) +{ + devlink_resources_unregister(ds->devlink, NULL); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + return devlink_resource_occ_get_register(ds->devlink, resource_id, + occ_get, occ_get_priv); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); + +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id) +{ + devlink_resource_occ_get_unregister(ds->devlink, resource_id); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 716d265ba8ca..9ef2caa13f27 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -45,6 +45,10 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) dst->index = index; + INIT_LIST_HEAD(&dst->rtable); + + INIT_LIST_HEAD(&dst->ports); + INIT_LIST_HEAD(&dst->list); list_add_tail(&dst->list, &dsa_tree_list); @@ -111,24 +115,38 @@ static bool dsa_port_is_user(struct dsa_port *dp) static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + list_for_each_entry(dp, &dst->ports, list) + if (dp->dn == dn) + return dp; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + return NULL; +} - if (dp->dn == dn) - return dp; - } - } +struct dsa_link *dsa_link_touch(struct dsa_port *dp, struct dsa_port *link_dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst; + struct dsa_link *dl; - return NULL; + dst = ds->dst; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp == dp && dl->link_dp == link_dp) + return dl; + + dl = kzalloc(sizeof(*dl), GFP_KERNEL); + if (!dl) + return NULL; + + dl->dp = dp; + dl->link_dp = link_dp; + + INIT_LIST_HEAD(&dl->list); + list_add_tail(&dl->list, &dst->rtable); + + return dl; } static bool dsa_port_setup_routing_table(struct dsa_port *dp) @@ -138,6 +156,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) struct device_node *dn = dp->dn; struct of_phandle_iterator it; struct dsa_port *link_dp; + struct dsa_link *dl; int err; of_for_each_phandle(&it, err, dn, "link", NULL, 0) { @@ -147,24 +166,22 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp) return false; } - ds->rtable[link_dp->ds->index] = dp->index; + dl = dsa_link_touch(dp, link_dp); + if (!dl) { + of_node_put(it.node); + return false; + } } return true; } -static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { bool complete = true; struct dsa_port *dp; - int i; - - for (i = 0; i < DSA_MAX_SWITCHES; i++) - ds->rtable[i] = DSA_RTABLE_NONE; - - for (i = 0; i < ds->num_ports; i++) { - dp = &ds->ports[i]; + list_for_each_entry(dp, &dst->ports, list) { if (dsa_port_is_dsa(dp)) { complete = dsa_port_setup_routing_table(dp); if (!complete) @@ -175,81 +192,42 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) return complete; } -static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - bool complete = true; - int device; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - complete = dsa_switch_setup_routing_table(ds); - if (!complete) - break; - } - - return complete; -} - static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_cpu(dp)) - return dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + return dp; return NULL; } static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - struct dsa_port *dp; - int device, port; + struct dsa_port *cpu_dp, *dp; - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = dsa_tree_find_first_cpu(dst); - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); + cpu_dp = dsa_tree_find_first_cpu(dst); + if (!cpu_dp) { + pr_err("DSA: tree %d has no CPU port\n", dst->index); return -EINVAL; } /* Assign the default CPU port to all ports of the fabric */ - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = dst->cpu_dp; - } - } + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; return 0; } static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) { - /* DSA currently only supports a single CPU port */ - dst->cpu_dp = NULL; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = NULL; } static int dsa_port_setup(struct dsa_port *dp) @@ -265,6 +243,9 @@ static int dsa_port_setup(struct dsa_port *dp) bool dsa_port_enabled = false; int err = 0; + if (dp->setup) + return 0; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -333,14 +314,21 @@ static int dsa_port_setup(struct dsa_port *dp) dsa_port_link_unregister_of(dp); if (err && devlink_port_registered) devlink_port_unregister(dlp); + if (err) + return err; - return err; + dp->setup = true; + + return 0; } static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) + return; + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; @@ -363,11 +351,17 @@ static void dsa_port_teardown(struct dsa_port *dp) } break; } + + dp->setup = false; } static int dsa_switch_setup(struct dsa_switch *ds) { - int err = 0; + struct dsa_devlink_priv *dl_priv; + int err; + + if (ds->setup) + return 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -379,9 +373,11 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, 0); + ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); if (!ds->devlink) return -ENOMEM; + dl_priv = devlink_priv(ds->devlink); + dl_priv->ds = ds; err = devlink_register(ds->devlink, ds->dev); if (err) @@ -395,6 +391,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + devlink_params_publish(ds->devlink); + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { @@ -409,6 +407,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto unregister_notifier; } + ds->setup = true; + return 0; unregister_notifier: @@ -424,6 +424,9 @@ free_devlink: static void dsa_switch_teardown(struct dsa_switch *ds) { + if (!ds->setup) + return; + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); @@ -438,95 +441,72 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->devlink = NULL; } + ds->setup = false; } static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port, i; - int err = 0; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; + int err; - err = dsa_switch_setup(ds); + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_switch_setup(dp->ds); if (err) - goto switch_teardown; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + goto teardown; + } - err = dsa_port_setup(dp); - if (err) - goto ports_teardown; - } + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_port_setup(dp); + if (err) + goto teardown; } return 0; -ports_teardown: - for (i = 0; i < port; i++) - dsa_port_teardown(&ds->ports[i]); +teardown: + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_switch_teardown(ds); - -switch_teardown: - for (i = 0; i < device; i++) { - ds = dst->ds[i]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - - for (device = 0; device < DSA_MAX_SWITCHES; device++) { - ds = dst->ds[device]; - if (!ds) - continue; - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; + list_for_each_entry(dp, &dst->ports, list) + dsa_port_teardown(dp); - dsa_port_teardown(dp); - } - - dsa_switch_teardown(ds); - } + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); } static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; + int err; - /* DSA currently supports a single pair of CPU port and master device */ - return dsa_master_setup(master, cpu_dp); + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_cpu(dp)) { + err = dsa_master_setup(dp->master, dp); + if (err) + return err; + } + } + + return 0; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp = dst->cpu_dp; - struct net_device *master = cpu_dp->master; + struct dsa_port *dp; - return dsa_master_teardown(master); + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + dsa_master_teardown(dp->master); } static int dsa_tree_setup(struct dsa_switch_tree *dst) @@ -572,6 +552,8 @@ teardown_default_cpu: static void dsa_tree_teardown(struct dsa_switch_tree *dst) { + struct dsa_link *dl, *next; + if (!dst->setup) return; @@ -581,39 +563,36 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } + pr_info("DSA: tree %d torn down\n", dst->index); dst->setup = false; } -static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, - unsigned int index) +static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { - dsa_tree_teardown(dst); + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp; - dst->ds[index] = NULL; - dsa_tree_put(dst); -} + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == index) + return dp; -static int dsa_tree_add_switch(struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - unsigned int index = ds->index; - int err; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return NULL; - if (dst->ds[index]) - return -EBUSY; + dp->ds = ds; + dp->index = index; - dsa_tree_get(dst); - dst->ds[index] = ds; + INIT_LIST_HEAD(&dp->list); + list_add_tail(&dp->list, &dst->ports); - err = dsa_tree_setup(dst); - if (err) { - dst->ds[index] = NULL; - dsa_tree_put(dst); - } - - return err; + return dp; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -708,7 +687,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, goto out_put_node; } - dp = &ds->ports[reg]; + dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); if (err) @@ -732,8 +711,6 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return sz; ds->index = m[1]; - if (ds->index >= DSA_MAX_SWITCHES) - return -EINVAL; ds->dst = dsa_tree_touch(m[0]); if (!ds->dst) @@ -742,6 +719,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return 0; } +static int dsa_switch_touch_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int port; + + for (port = 0; port < ds->num_ports; port++) { + dp = dsa_port_touch(ds, port); + if (!dp) + return -ENOMEM; + } + + return 0; +} + static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) { int err; @@ -750,6 +741,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) if (err) return err; + err = dsa_switch_touch_ports(ds); + if (err) + return err; + return dsa_switch_parse_ports_of(ds, dn); } @@ -787,7 +782,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, for (i = 0; i < DSA_MAX_PORTS; i++) { name = cd->port_names[i]; dev = cd->netdev[i]; - dp = &ds->ports[i]; + dp = dsa_to_port(ds, i); if (!name) continue; @@ -807,6 +802,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds, static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) { + int err; + ds->cd = cd; /* We don't support interconnected switches nor multiple trees via @@ -817,22 +814,29 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) if (!ds->dst) return -ENOMEM; - return dsa_switch_parse_ports(ds, cd); -} - -static int dsa_switch_add(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst = ds->dst; + err = dsa_switch_touch_ports(ds); + if (err) + return err; - return dsa_tree_add_switch(dst, ds); + return dsa_switch_parse_ports(ds, cd); } static int dsa_switch_probe(struct dsa_switch *ds) { - struct dsa_chip_data *pdata = ds->dev->platform_data; - struct device_node *np = ds->dev->of_node; + struct dsa_switch_tree *dst; + struct dsa_chip_data *pdata; + struct device_node *np; int err; + if (!ds->dev) + return -ENODEV; + + pdata = ds->dev->platform_data; + np = ds->dev->of_node; + + if (!ds->num_ports) + return -EINVAL; + if (np) err = dsa_switch_parse_of(ds, np); else if (pdata) @@ -843,29 +847,14 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (err) return err; - return dsa_switch_add(ds); -} - -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) -{ - struct dsa_switch *ds; - int i; - - ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL); - if (!ds) - return NULL; - - ds->dev = dev; - ds->num_ports = n; - - for (i = 0; i < ds->num_ports; ++i) { - ds->ports[i].index = i; - ds->ports[i].ds = ds; - } + dst = ds->dst; + dsa_tree_get(dst); + err = dsa_tree_setup(dst); + if (err) + dsa_tree_put(dst); - return ds; + return err; } -EXPORT_SYMBOL_GPL(dsa_switch_alloc); int dsa_register_switch(struct dsa_switch *ds) { @@ -883,9 +872,16 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - unsigned int index = ds->index; + struct dsa_port *dp, *next; - dsa_tree_remove_switch(dst, index); + dsa_tree_teardown(dst); + + list_for_each_entry_safe(dp, next, &dst->ports, list) { + list_del(&dp->list); + kfree(dp); + } + + dsa_tree_put(dst); } void dsa_unregister_switch(struct dsa_switch *ds) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12f8c7ee4dd8..2dd86d9bcda9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -104,25 +104,14 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; - struct dsa_switch *ds; - struct dsa_port *slave_port; + struct dsa_port *dp; - if (device < 0 || device >= DSA_MAX_SWITCHES) - return NULL; + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->index == device && dp->index == port && + dp->type == DSA_PORT_TYPE_USER) + return dp->slave; - ds = dst->ds[device]; - if (!ds) - return NULL; - - if (port < 0 || port >= ds->num_ports) - return NULL; - - slave_port = &ds->ports[port]; - - if (unlikely(slave_port->type != DSA_PORT_TYPE_USER)) - return NULL; - - return slave_port->slave; + return NULL; } /* port.c */ @@ -164,8 +153,8 @@ void dsa_port_link_unregister_of(struct dsa_port *dp); void dsa_port_phylink_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state); +void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state); void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); diff --git a/net/dsa/port.c b/net/dsa/port.c index 9b54e5a76297..46ac9ba21987 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -429,19 +429,22 @@ void dsa_port_phylink_validate(struct phylink_config *config, } EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); -int dsa_port_phylink_mac_link_state(struct phylink_config *config, - struct phylink_link_state *state) +void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; - /* Only called for SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; + /* Only called for inband modes */ + if (!ds->ops->phylink_mac_link_state) { + state->link = 0; + return; + } - return ds->ops->phylink_mac_link_state(ds, dp->index, state); + if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0) + state->link = 0; } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state); void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, @@ -510,7 +513,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, - .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_config = dsa_port_phylink_mac_config, .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, @@ -561,7 +564,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) struct dsa_switch *ds = dp->ds; struct phy_device *phydev; int port = dp->index; - int mode; + phy_interface_t mode; int err; err = of_phy_register_fixed_link(dn); @@ -574,8 +577,8 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) phydev = of_phy_find_device(dn); - mode = of_get_phy_mode(dn); - if (mode < 0) + err = of_get_phy_mode(dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; phydev->interface = mode; @@ -593,10 +596,11 @@ static int dsa_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; struct device_node *port_dn = dp->dn; - int mode, err; + phy_interface_t mode; + int err; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + err = of_get_phy_mode(port_dn, &mode); + if (err) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = ds->dev; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 028e65f4b5ba..78ffc87dc25e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -789,6 +789,22 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, return phylink_ethtool_ksettings_set(dp->pl, cmd); } +static void dsa_slave_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + phylink_ethtool_get_pauseparam(dp->pl, pause); +} + +static int dsa_slave_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_set_pauseparam(dp->pl, pause); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) @@ -1192,6 +1208,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_pauseparam = dsa_slave_get_pauseparam, + .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, @@ -1295,11 +1313,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; + phy_interface_t mode; u32 phy_flags = 0; - int mode, ret; + int ret; - mode = of_get_phy_mode(port_dn); - if (mode < 0) + ret = of_get_phy_mode(port_dn, &mode); + if (ret) mode = PHY_INTERFACE_MODE_NA; dp->pl_config.dev = &slave_dev->dev; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 6a9607518823..df4abe897ed6 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, int i; for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; + struct dsa_port *dp = dsa_to_port(ds, i); if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; @@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (unset_vlan_filtering) { struct switchdev_trans trans = {0}; - err = dsa_port_vlan_filtering(&ds->ports[info->port], + err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), false, &trans); if (err && err != EOPNOTSUPP) return err; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 9e5a883a9f0c..2fb6c26294b5 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -31,15 +31,14 @@ * Must be transmitted as zero and ignored on receive. * * SWITCH_ID - VID[8:6]: - * Index of switch within DSA tree. Must be between 0 and - * DSA_MAX_SWITCHES - 1. + * Index of switch within DSA tree. Must be between 0 and 7. * * RSV - VID[5:4]: * To be used for further expansion of PORT or for other purposes. * Must be transmitted as zero and ignored on receive. * * PORT - VID[3:0]: - * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. + * Index of switch port. Must be between 0 and 15. */ #define DSA_8021Q_DIR_SHIFT 10 @@ -103,7 +102,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) if (!dsa_is_user_port(ds, port)) return 0; - slave = ds->ports[port].slave; + slave = dsa_to_port(ds, port)->slave; err = br_vlan_get_pvid(slave, &pvid); if (!pvid || err < 0) @@ -118,7 +117,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) return err; } - return dsa_port_vid_add(&ds->ports[port], pvid, vinfo.flags); + return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); } /* If @enabled is true, installs @vid with @flags into the switch port's HW @@ -130,7 +129,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { - struct dsa_port *dp = &ds->ports[port]; + struct dsa_port *dp = dsa_to_port(ds, port); struct bridge_vlan_info vinfo; int err; @@ -342,13 +341,4 @@ struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); -static const struct dsa_device_ops dsa_8021q_netdev_ops = { - .name = "8021q", - .proto = DSA_TAG_PROTO_8021Q, - .overhead = VLAN_HLEN, -}; - MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_8021Q); - -module_dsa_tag_driver(dsa_8021q_netdev_ops); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c new file mode 100644 index 000000000000..8e3e7283d430 --- /dev/null +++ b/net/dsa/tag_ocelot.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2019 NXP Semiconductors + */ +#include <soc/mscc/ocelot.h> +#include <linux/packing.h> +#include "dsa_priv.h" + +/* The CPU injection header and the CPU extraction header can have 3 types of + * prefixes: long, short and no prefix. The format of the header itself is the + * same in all 3 cases. + * + * Extraction with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Extraction with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | extraction | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Extraction with no prefix: + * + * +------------+-------+ + * | extraction | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * + * Injection with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | any dmac | any smac | 8880 | 000a | injection | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Injection with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | injection | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Injection with no prefix: + * + * +------------+-------+ + * | injection | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * The injection header looks like this (network byte order, bit 127 + * is part of lowest address byte in memory, bit 0 is part of highest + * address byte): + * + * +------+------+------+------+------+------+------+------+ + * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | RSV | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | RSV | DEST | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | DEST | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| + * +------+------+------+------+------+------+------+------+ + * 39: 32 | TFRM_TIMER | RSV | + * +------+------+------+------+------+------+------+------+ + * 31: 24 | RSV | DP | POP_CNT | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + * + * And the extraction header looks like this: + * + * +------+------+------+------+------+------+------+------+ + * 127:120 | RSV | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | LLEN | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | LLEN | WLEN | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | WLEN | RSV | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | RSV | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | ACL_ID | + * +------+------+------+------+------+------+------+------+ + * 39: 32 | ACL_ID | RSV | SFLOW_ID | + * +------+------+------+------+------+------+------+------+ + * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + */ + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u64 bypass, dest, src, qos_class, rew_op; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u8 *injection; + + if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) { + netdev_err(netdev, "Cannot make room for tag.\n"); + return NULL; + } + + injection = skb_push(skb, OCELOT_TAG_LEN); + + memset(injection, 0, OCELOT_TAG_LEN); + + src = dsa_upstream_port(ds, port); + dest = BIT(port); + bypass = true; + qos_class = skb->priority; + + packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + rew_op = ocelot_port->ptp_cmd; + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + rew_op |= (ocelot_port->ts_id % 4) << 3; + ocelot_port->ts_id++; + } + + packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); + } + + return skb; +} + +static struct sk_buff *ocelot_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + u64 src_port, qos_class; + u8 *start = skb->data; + u8 *extraction; + + /* Revert skb->data by the amount consumed by the DSA master, + * so it points to the beginning of the frame. + */ + skb_push(skb, ETH_HLEN); + /* We don't care about the long prefix, it is just for easy entrance + * into the DSA master's RX filter. Discard it now by moving it into + * the headroom. + */ + skb_pull(skb, OCELOT_LONG_PREFIX_LEN); + /* And skb->data now points to the extraction frame header. + * Keep a pointer to it. + */ + extraction = skb->data; + /* Now the EFH is part of the headroom as well */ + skb_pull(skb, OCELOT_TAG_LEN); + /* Reset the pointer to the real MAC header */ + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + /* And move skb->data to the correct location again */ + skb_pull(skb, ETH_HLEN); + + /* Remove from inet csum the extraction header */ + skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN); + + packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); + packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); + + skb->dev = dsa_master_find_slave(netdev, 0, src_port); + if (!skb->dev) + /* The switch will reflect back some frames sent through + * sockets opened on the bare DSA master. These will come back + * with src_port equal to the index of the CPU port, for which + * there is no slave registered. So don't print any error + * message here (ignore and drop those frames). + */ + return NULL; + + skb->offload_fwd_mark = 1; + skb->priority = qos_class; + + return skb; +} + +static struct dsa_device_ops ocelot_netdev_ops = { + .name = "ocelot", + .proto = DSA_TAG_PROTO_OCELOT, + .xmit = ocelot_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); + +module_dsa_tag_driver(ocelot_netdev_ops); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 17374afee28f..9040fe55e0f5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index ffcfcef76291..7c5a1aa5adb4 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -236,21 +236,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, - nl802154_policy, NULL); - if (err) - goto out_unlock; - *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - genl_family_attrbuf(&nl802154_fam)); + info->attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -557,17 +550,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { - struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nl802154_fam.hdrsize, - tb, nl802154_fam.maxattr, - nl802154_policy, NULL); - - /* TODO check if we can handle error here, - * we have no backward compatibility - */ - if (ret) - return 0; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr **tb = info->attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); @@ -2203,7 +2187,8 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, @@ -2343,7 +2328,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_KEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, @@ -2369,7 +2355,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, @@ -2395,7 +2382,8 @@ static const struct genl_ops nl802154_ops[] = { /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, @@ -2420,7 +2408,8 @@ static const struct genl_ops nl802154_ops[] = { }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 03381f3e12ba..fc816b187170 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -180,8 +180,8 @@ config NET_IPIP config NET_IPGRE_DEMUX tristate "IP: GRE demultiplexer" help - This is helper module to demultiplex GRE packets on GRE version field criteria. - Required by ip_gre and pptp modules. + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. config NET_IP_TUNNEL tristate @@ -459,200 +459,200 @@ config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" default m ---help--- - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ config TCP_CONG_CUBIC tristate "CUBIC TCP" default y ---help--- - This is version 2.0 of BIC-TCP which uses a cubic growth function - among other techniques. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m ---help--- - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. config TCP_CONG_HTCP tristate "H-TCP" default m ---help--- - H-TCP is a send-side only modifications of the TCP Reno - protocol stack that optimizes the performance of TCP - congestion control for high speed network links. It uses a - modeswitch to change the alpha and beta parameters of TCP Reno - based on network conditions and in a way so as to be fair with - other Reno and H-TCP flows. + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. config TCP_CONG_HSTCP tristate "High Speed TCP" default n ---help--- - Sally Floyd's High Speed TCP (RFC 3649) congestion control. - A modification to TCP's congestion control mechanism for use - with large congestion windows. A table indicates how much to - increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" default n ---help--- - TCP-Hybla is a sender-side only change that eliminates penalization of - long-RTT, large-bandwidth connections, like when satellite legs are - involved, especially when sharing a common bottleneck with normal - terrestrial connections. + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, especially when sharing a common bottleneck with normal + terrestrial connections. config TCP_CONG_VEGAS tristate "TCP Vegas" default n ---help--- - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. config TCP_CONG_NV - tristate "TCP NV" - default n - ---help--- - TCP NV is a follow up to TCP Vegas. It has been modified to deal with - 10G networks, measurement noise introduced by LRO, GRO and interrupt - coalescence. In addition, it will decrease its cwnd multiplicatively - instead of linearly. + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. - Note that in general congestion avoidance (cwnd decreased when # packets - queued grows) cannot coexist with congestion control (cwnd decreased only - when there is packet loss) due to fairness issues. One scenario when they - can coexist safely is when the CA flows have RTTs << CC flows RTTs. + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. - For further details see http://www.brakmo.org/networking/tcp-nv/ + For further details see http://www.brakmo.org/networking/tcp-nv/ config TCP_CONG_SCALABLE tristate "Scalable TCP" default n ---help--- - Scalable TCP is a sender-side only change to TCP which uses a - MIMD congestion control algorithm which has some nice scaling - properties, though is known to have fairness issues. - See http://www.deneholme.net/tom/scalable/ + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www.deneholme.net/tom/scalable/ config TCP_CONG_LP tristate "TCP Low Priority" default n ---help--- - TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utilize only the excess network bandwidth as compared to the - ``fair share`` of bandwidth as targeted by TCP. - See http://www-ece.rice.edu/networks/TCP-LP/ + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utilize only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ config TCP_CONG_VENO tristate "TCP Veno" default n ---help--- - TCP Veno is a sender-side only enhancement of TCP to obtain better - throughput over wireless networks. TCP Veno makes use of state - distinguishing to circumvent the difficult judgment of the packet loss - type. TCP Veno cuts down less congestion window in response to random - loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" select TCP_CONG_VEGAS default n ---help--- - YeAH-TCP is a sender-side high-speed enabled TCP congestion control - algorithm, which uses a mixed loss/delay approach to compute the - congestion window. It's design goals target high efficiency, - internal, RTT and Reno fairness, resilience to link loss while - keeping network elements load as low as possible. + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. - For further details look here: - http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf config TCP_CONG_ILLINOIS tristate "TCP Illinois" default n ---help--- - TCP-Illinois is a sender-side modification of TCP Reno for - high speed long delay links. It uses round-trip-time to - adjust the alpha and beta parameters to achieve a higher average - throughput and maintain fairness. + TCP-Illinois is a sender-side modification of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. - For further details see: - http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html config TCP_CONG_DCTCP tristate "DataCenter TCP (DCTCP)" default n ---help--- - DCTCP leverages Explicit Congestion Notification (ECN) in the network to - provide multi-bit feedback to the end hosts. It is designed to provide: + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: - - High burst tolerance (incast due to partition/aggregate), - - Low latency (short flows, queries), - - High throughput (continuous data updates, large file transfers) with - commodity, shallow-buffered switches. + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. - All switches in the data center network running DCTCP must support - ECN marking and be configured for marking when reaching defined switch - buffer thresholds. The default ECN marking threshold heuristic for - DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets - (~100KB) at 10Gbps, but might need further careful tweaking. + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. - For further details see: - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf config TCP_CONG_CDG tristate "CAIA Delay-Gradient (CDG)" default n ---help--- - CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies - the TCP sender in order to: + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control. o Tolerate packet loss unrelated to congestion. - For further details see: - D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg config TCP_CONG_BBR tristate "BBR TCP" default n ---help--- - BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to - maximize network utilization and minimize queues. It builds an explicit - model of the the bottleneck delivery rate and path round-trip - propagation delay. It tolerates packet loss and delay unrelated to - congestion. It can operate over LAN, WAN, cellular, wifi, or cable - modem links. It can coexist with flows that use loss-based congestion - control, and can operate with shallow buffers, deep buffers, - bufferbloat, policers, or AQM schemes that do not provide a delay - signal. It requires the fq ("Fair Queue") pacing packet scheduler. + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. choice prompt "Default TCP congestion control" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70f92aaca411..53de8e00990e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 71c78d223dfd..577db1d50a24 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -70,11 +70,6 @@ fail: fib_free_table(main_table); return -ENOMEM; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return false; -} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -131,11 +126,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return net->ipv4.fib_has_custom_rules; -} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include <net/netns/ipv4.h> #include <net/ip_fib.h> -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, @@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ab2fb6bb37d..b9df9c09b84e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,11 +74,13 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -2015,10 +2017,12 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,39 +2036,53 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, + KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4298aae74e0e..18068ed42f25 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -249,10 +249,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -262,14 +263,14 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { credit--; rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -682,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 480d0b22db1a..3b9c7a2725a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) } } -static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index eb30fc1770de..e4c6e8b40490 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -716,7 +716,7 @@ static void reqsk_timer_handler(struct timer_list *t) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7dc79b973e6e..af154977904c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; @@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_timer.expires - jiffies; - if (tmo < 0) - tmo = 0; - inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index be778599bfed..ff327a62c9ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base, base->total / inet_peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - delta = (__u32)jiffies - p->dtime; + + /* The READ_ONCE() pairs with the WRITE_ONCE() + * in inet_putpeer() + */ + delta = (__u32)jiffies - READ_ONCE(p->dtime); + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } @@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - p->dtime = (__u32)jiffies; + /* The WRITE_ONCE() pairs with itself (we run lockless) + * and the READ_ONCE() in inet_peer_gc() + */ + WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 10636fb6093e..572b6307a2df 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + const struct iphdr *tnl_params; + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; @@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_pop_mac_header(skb); else skb_reset_mac_header(skb); - if (tunnel->collect_md) { + + tnl_params = &tunnel->parms.iph; + if (tunnel->collect_md || tnl_params->daddr == 0) { __be16 flags; __be64 tun_id; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c59a78a267c3..aa438c6758a7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -302,16 +302,31 @@ drop: return true; } +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && + ip_hdr(hint)->tos == iph->tos; +} + INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); struct rtable *rt; int err; + if (ip_can_use_hint(skb, iph, hint)) { + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, + dev, hint); + if (unlikely(err)) + goto drop_error; + } + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev); + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head) } } +static struct sk_buff *ip_extract_route_hint(const struct net *net, + struct sk_buff *skb, int rt_type) +{ + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + return NULL; + + return skb; +} + static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip_extract_route_hint(net, skb, + ((struct rtable *)dst)->rt_type); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); @@ -611,5 +638,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3d8baaaf7086..9d83cb320dcb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -422,7 +422,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); @@ -430,7 +430,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1452a97914a0..47f8b947eef1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -34,6 +34,9 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; @@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) - return NULL; - res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); + src = &md->u.tun_info; + res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; - src = &md->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, @@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, dst->key.u.ipv4.dst = src->key.u.ipv4.src; dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; + ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), + src->options_len, 0); return res; } @@ -211,30 +215,243 @@ void ip_tunnel_get_stats64(struct net_device *dev, EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { + [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { + [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static const struct nla_policy +vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; +static const struct nla_policy +erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + +static int ip_tun_parse_opts_geneve(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + int data_len, err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, + geneve_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || + !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || + !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) + return -EINVAL; + + attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + if (info) { + struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(attr); + attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(attr); + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int ip_tun_parse_opts_vxlan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, + vxlan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) + return -EINVAL; + + if (info) { + struct vxlan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; + md->gbp = nla_get_u32(attr); + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + } + + return sizeof(struct vxlan_metadata); +} + +static int ip_tun_parse_opts_erspan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, + erspan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) + return -EINVAL; + + ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) + return -EINVAL; + } else if (ver == 2) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || + !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) + return -EINVAL; + } else { + return -EINVAL; + } + + if (info) { + struct erspan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + md->version = ver; + if (ver == 1) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(attr); + } else { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(attr); + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(attr)); + } + + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + } + + return sizeof(struct erspan_metadata); +} + +static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, opts_len = 0, type = 0; + struct nlattr *nla; + + if (!attr) + return 0; + + err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, + ip_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(nla)) { + case LWTUNNEL_IP_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) + return -EINVAL; + opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) + return -EINVAL; + type = TUNNEL_GENEVE_OPT; + break; + case LWTUNNEL_IP_OPTS_VXLAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case LWTUNNEL_IP_OPTS_ERSPAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; + break; + default: + return -EINVAL; + } + } + + return opts_len; +} + +static int ip_tun_get_optlen(struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, NULL, extack); +} + +static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, info, extack); +} + static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -242,6 +459,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { @@ -266,10 +489,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -285,6 +510,114 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) #endif } +static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct geneve_opt *opt; + struct nlattr *nest; + int offset = 0; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); + if (!nest) + return -ENOMEM; + + while (tun_info->options_len > offset) { + opt = ip_tunnel_info_opts(tun_info) + offset; + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + offset += sizeof(*opt) + opt->length * 4; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, nest); + return 0; +err: + nla_nest_cancel(skb, nest); + return -ENOMEM; +} + +static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nest; + int err = 0; + + if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + nest = nla_nest_start_noflag(skb, type); + if (!nest) + return -ENOMEM; + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + err = ip_tun_fill_encap_opts_geneve(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + err = ip_tun_fill_encap_opts_erspan(skb, tun_info); + + if (err) { + nla_nest_cancel(skb, nest); + return err; + } + + nla_nest_end(skb, nest); + return 0; +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -296,12 +629,52 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } +static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) +{ + int opt_len; + + if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt; + int offset = 0; + + opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ + while (info->options_len > offset) { + opt = ip_tunnel_info_opts(info) + offset; + opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + offset += sizeof(*opt) + opt->length * 4; + } + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + + nla_total_size(4); /* OPT_VXLAN_GBP */ + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + + nla_total_size(1) /* OPT_ERSPAN_VER */ + + (md->version == 1 ? nla_total_size(4) + /* OPT_ERSPAN_INDEX (v1) */ + : nla_total_size(1) + + nla_total_size(1)); + /* OPT_ERSPAN_DIR + HWID (v2) */ + } + + return opt_len; +} + static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ @@ -309,13 +682,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { - return memcmp(lwt_tun_info(a), lwt_tun_info(b), - sizeof(struct ip_tunnel_info)); + struct ip_tunnel_info *info_a = lwt_tun_info(a); + struct ip_tunnel_info *info_b = lwt_tun_info(b); + + return memcmp(info_a, info_b, sizeof(info_a->key)) || + info_a->mode != info_b->mode || + info_a->options_len != info_b->options_len || + memcmp(ip_tunnel_info_opts(info_a), + ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { @@ -328,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct nlattr *attr, @@ -341,17 +724,21 @@ static int ip6_tun_build_state(struct nlattr *attr, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -359,6 +746,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); @@ -375,10 +768,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -396,7 +791,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; @@ -409,7 +805,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9bcca08efec9..f35308ff84c3 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void) struct net_device *dev; int found = 0; + /* make sure deferred device probes are finished */ + wait_for_device_probe(); + rtnl_lock(); for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { @@ -1483,10 +1486,10 @@ static int __init ip_auto_config(void) * missing values. */ if (ic_myaddr == NONE || -#ifdef CONFIG_ROOT_NFS +#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT) (root_server_addr == NONE && ic_servaddr == NONE && - ROOT_DEV == Root_NFS) || + (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1513,6 +1516,12 @@ static int __init ip_auto_config(void) goto try_try_again; } #endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + pr_err("IP-Config: Retrying forever (CIFS root)...\n"); + goto try_try_again; + } +#endif if (--retries) { pr_err("IP-Config: Reopening network devices...\n"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 58007439cffd..6e68def66822 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -3041,10 +3043,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id, extack); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id, extack); + if (err) + return err; + } } return 0; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 012c4047c788..e32e41b99f0f 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -9,6 +9,8 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv4, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..c94445b44d8c 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, if (icmph == NULL) return 1; - switch (icmph->type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - break; - default: + if (!icmp_is_err(icmph->type)) return 1; - } inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fc34fd1668d6..511eaa94e2d1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { - [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 621f83434b24..f88c93c38f11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1894,10 +1894,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, @@ -2022,10 +2019,52 @@ static int ip_mkroute_input(struct sk_buff *skb, return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } +/* Implements all the saddr-related checks as ip_route_input_slow(), + * assuming daddr is valid and the destination is not a local broadcast one. + * Uses the provided hint instead of performing a route lookup. + */ +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + const struct sk_buff *hint) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt = (struct rtable *)hint; + struct net *net = dev_net(dev); + int err = -EINVAL; + u32 tag = 0; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + + if (rt->rt_type != RTN_LOCAL) + goto skip_validate_source; + + tos &= IPTOS_RT_MASK; + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + if (err < 0) + goto martian_source; + +skip_validate_source: + skb_dst_copy(skb, hint); + return 0; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + return err; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. + * Changes in the enforced policies must be applied also to + * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 535b69326f66..345b2b0ff618 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + u32 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; ireq = inet_rsk(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0902cb32bbad..fcb2cd167f64 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); + + if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) + break; + if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d8876f0e9672..9b48aec29aca 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1741,8 +1741,8 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; + u32 length = 0, seq, offset, zap_len; const skb_frag_t *frags = NULL; - u32 length = 0, seq, offset; struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; @@ -1769,12 +1769,12 @@ static int tcp_zerocopy_receive(struct sock *sk, seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); - zc->length &= ~(PAGE_SIZE - 1); - if (zc->length) { - zap_page_range(vma, address, zc->length); + zap_len = zc->length & ~(PAGE_SIZE - 1); + if (zap_len) { + zap_page_range(vma, address, zap_len); zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = inq; + zc->recv_skip_hint = zc->length; } ret = 0; while (length + PAGE_SIZE <= zc->length) { @@ -1958,8 +1958,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping_internal tss; - bool has_tss = false; - bool has_cmsg; + int cmsg_flags; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1974,7 +1973,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; - has_cmsg = tp->recvmsg_inq; + cmsg_flags = tp->recvmsg_inq ? 1 : 0; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2047,7 +2046,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -2157,8 +2156,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); - has_tss = true; - has_cmsg = true; + cmsg_flags |= 2; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -2183,10 +2181,10 @@ found_fin_ok: release_sock(sk); - if (has_cmsg) { - if (has_tss) + if (cmsg_flags) { + if (cmsg_flags & 2) tcp_recv_timestamp(msg, sk, &tss); - if (tp->recvmsg_inq) { + if (cmsg_flags & 1) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -2666,6 +2664,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -3224,8 +3223,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } @@ -3305,6 +3304,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c445a81d144e..3737ec096650 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 549506162dde..0d08f9e2d8d0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,8 +21,8 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index a915ade0c818..19ad9586c720 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return true; } - return cookie->len > 0; + if (cookie->len > 0) + return true; + tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; + return false; } /* This function checks if we want to defer sending SYN until the first diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a2e52ad7cdab..88b987ca9ebb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5814,6 +5814,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ + if (tp->total_retrans) + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; + else + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67b2dc7a1727..92282f98dc82 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && - (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && - (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif @@ -2453,7 +2451,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) state = inet_sk_state_load(sk); if (state == TCP_LISTEN) - rx_queue = sk->sk_ack_backlog; + rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0488607c5cd3..be6d22b8190f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3290,7 +3290,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 4849edb62d52..12ab5db2b71c 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -92,6 +92,9 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 447defbfccdd..4da5758cc718 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2551,9 +2551,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_ENCAP: switch (val) { case 0: +#ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; +#endif /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ecff3fce9807..89ba7c87de5d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 34ccef18b40e..98d82305d6de 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5552,14 +5552,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) goto nla_put_failure; - - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) - goto nla_put_failure; - read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 05f82baaa99e..f87ae33e1d01 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -7,12 +7,12 @@ #include <net/netns/ipv6.h> #include <net/ip6_fib.h> -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, @@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net) return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } -static int fib6_dump(struct net *net, struct notifier_block *nb) +static int fib6_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib6_rules_dump(net, nb); + err = fib6_rules_dump(net, nb, extack); if (err) return err; - return fib6_tables_dump(net, nb); + return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f9e8fe3ff0c5..fafe556d21e0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); -int fib6_rules_dump(struct net *net, struct notifier_block *nb) +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET6); + return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 62c997201970..ef408a5090a2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -516,13 +516,29 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, mip6_addr_swap(skb); + sk = icmpv6_xmit_lock(net); + if (!sk) + goto out_bh_enable; + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; - if (saddr) + if (saddr) { fl6.saddr = *saddr; + } else { + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + + in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, + &fl6.saddr); + dev_put(in_netdev); + } + } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; @@ -531,10 +547,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - sk = icmpv6_xmit_lock(net); - if (!sk) - goto out_bh_enable; - sk->sk_mark = mark; np = inet6_sk(sk); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e2af411cd9c..7bae6a91b487 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -357,15 +357,17 @@ unsigned int fib6_tables_seq_read(struct net *net) return fib_seq; } -static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib6_info *rt) + struct fib6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; - return call_fib6_notifier(nb, net, event_type, &info.info); + return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, @@ -401,40 +403,51 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_dump_arg { struct net *net; struct notifier_block *nb; + struct netlink_ext_ack *extack; }; -static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) - return; - call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); + return 0; + return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, + rt, arg->extack); } static int fib6_node_dump(struct fib6_walker *w) { struct fib6_info *rt; + int err = 0; - for_each_fib6_walker_rt(w) - fib6_rt_dump(rt, w->args); + for_each_fib6_walker_rt(w) { + err = fib6_rt_dump(rt, w->args); + if (err) + break; + } w->leaf = NULL; - return 0; + return err; } -static void fib6_table_dump(struct net *net, struct fib6_table *tb, - struct fib6_walker *w) +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) { + int err; + w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); - fib6_walk(net, w); + err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); + return err; } /* Called with rcu_read_lock() */ -int fib6_tables_dump(struct net *net, struct notifier_block *nb) +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; + int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) @@ -443,19 +456,24 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) w->func = fib6_node_dump; arg.net = net; arg.nb = nb; + arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib6_table_dump(net, tb, w); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err < 0) + goto out; + } } +out: kfree(w); - return 0; + return err; } static int fib6_dump_node(struct fib6_walker *w) @@ -1443,6 +1461,8 @@ out: } #endif goto failure; + } else if (fib6_requires_src(rt)) { + fib6_routes_require_src_inc(info->nl_net); } return err; @@ -1915,6 +1935,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info) struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { + if (fib6_requires_src(cur)) + fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 3d71c7d6102c..7b089d0ac8cd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -86,11 +86,27 @@ static void ip6_sublist_rcv_finish(struct list_head *head) } } +static bool ip6_can_use_hint(const struct sk_buff *skb, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && + ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); +} + +static struct sk_buff *ip6_extract_route_hint(const struct net *net, + struct sk_buff *skb) +{ + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) + return NULL; + + return skb; +} + static void ip6_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -104,9 +120,15 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip6_rcv(skb); if (!skb) continue; - ip6_rcv_finish_core(net, sk, skb); + + if (ip6_can_use_hint(skb, hint)) + skb_dst_copy(skb, hint); + else + ip6_rcv_finish_core(net, sk, skb); dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip6_extract_route_hint(net, skb); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv_finish(&sublist); @@ -325,7 +347,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip6_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 71827b56c006..945508a7cb0f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -160,7 +160,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); @@ -173,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 857a89ad4d6c..bfa49ff70531 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) @@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net) return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } -static int ip6mr_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock); + ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 69443e9a3aa5..0594131fa46d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -128,9 +128,9 @@ config IP6_NF_MATCH_HL depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_HL ---help--- - This is a backwards-compat option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_MATCH_HL. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' @@ -184,9 +184,9 @@ config IP6_NF_TARGET_HL depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compatible option for the user's convenience - (e.g. when running oldconfig). It selects - CONFIG_NETFILTER_XT_TARGET_HL. + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. config IP6_NF_FILTER tristate "Packet filtering" @@ -245,14 +245,14 @@ config IP6_NF_RAW # security table for MAC policy config IP6_NF_SECURITY - tristate "Security table" - depends on SECURITY - depends on NETFILTER_ADVANCED - help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. - - If unsure, say N. + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f6d9a48c7a2a..a8566ee12e83 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -10,6 +10,8 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv6, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 34d51cd426b0..6bac68fb27a3 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -150,4 +150,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); -MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); +MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support"); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3f83ea851ebf..b59940416cb5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1479,11 +1479,11 @@ static u32 rt6_exception_hash(const struct in6_addr *dst, u32 val; net_get_random_once(&seed, sizeof(seed)); - val = jhash(dst, sizeof(*dst), seed); + val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash(src, sizeof(*src), val); + val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val); #endif return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } @@ -2295,10 +2295,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && - icmph->icmp6_type != ICMPV6_PKT_TOOBIG && - icmph->icmp6_type != ICMPV6_TIME_EXCEED && - icmph->icmp6_type != ICMPV6_PARAMPROB) + if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, @@ -6202,6 +6199,9 @@ static int __net_init ip6_route_net_init(struct net *net) dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); +#ifdef CONFIG_IPV6_SUBTREES + net->ipv6.fib6_routes_require_src = 0; +#endif #endif net->ipv6.sysctl.flush_delay = 0; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index e70567446f28..85a5447a3e8d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -149,8 +149,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +static int +seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id, bool local_delivery) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -158,6 +159,7 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, struct dst_entry *dst = NULL; struct rt6_info *rt; struct flowi6 fl6; + int dev_flags = 0; fl6.flowi6_iif = skb->dev->ifindex; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; @@ -182,7 +184,13 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, dst = &rt->dst; } - if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) { + /* we want to discard traffic destined for local packet processing, + * if @local_delivery is set to false. + */ + if (!local_delivery) + dev_flags |= IFF_LOOPBACK; + + if (dst && (dst->dev->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } @@ -199,6 +207,12 @@ out: return dst->error; } +int seg6_lookup_nexthop(struct sk_buff *skb, + struct in6_addr *nhaddr, u32 tbl_id) +{ + return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); +} + /* regular endpoint function */ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -396,7 +410,7 @@ static int input_action_end_dt6(struct sk_buff *skb, skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - seg6_lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); return dst_input(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4804b6dc5e65..81f51335e326 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1891,7 +1891,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) state = inet_sk_state_load(sp); if (state == TCP_LISTEN) - rx_queue = sp->sk_ack_backlog; + rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index eecac1b7148e..fbe51d40bd7e 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -187,7 +187,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c74f44dfaa22..2922d4150d88 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -705,7 +705,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: @@ -780,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 4f03ebe732fa..6cbb1286d6c0 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -32,7 +32,8 @@ mac80211-y := \ chan.o \ trace.o mlme.o \ tdls.o \ - ocb.o + ocb.o \ + airtime.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b11883d26875..33da6f738c99 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret) { + if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + /* + * We didn't send the request yet, so don't need to check + * here if we already got a response, just mark as driver + * ready immediately. + */ + set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state); + } else if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c new file mode 100644 index 000000000000..63cb0028b02d --- /dev/null +++ b/net/mac80211/airtime.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: ISC +/* + * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name> + */ + +#include <net/mac80211.h> +#include "ieee80211_i.h" +#include "sta_info.h" + +#define AVG_PKT_SIZE 1024 + +/* Number of bits for an average sized packet */ +#define MCS_NBITS (AVG_PKT_SIZE << 3) + +/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per + * symbol. We use k-symbols to avoid rounding in the _TIME macros below. + */ +#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps)) + +/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024 + * symbols. + */ +#define MCS_SYMBOL_TIME(sgi, ksyms) \ + (sgi ? \ + ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \ + ((ksyms) * 4) /* 4.0 us per sym */ \ + ) + +/* Transmit duration for the raw data part of an average sized packet */ +#define MCS_DURATION(streams, sgi, bps) \ + ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps)))) + +#define MCS_DURATION_S(shift, streams, sgi, bps) \ + ((u16)((MCS_DURATION(streams, sgi, bps) >> shift))) + +/* These should match the values in enum nl80211_he_gi */ +#define HE_GI_08 0 +#define HE_GI_16 1 +#define HE_GI_32 2 + +/* Transmission time (1024 usec) for a packet containing (ksyms) * k-symbols */ +#define HE_SYMBOL_TIME(gi, ksyms) \ + (gi == HE_GI_08 ? \ + ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \ + (gi == HE_GI_16 ? \ + ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \ + ((ksyms) * 16) /* 16.0 us per sym */ \ + )) + +/* Transmit duration for the raw data part of an average sized packet */ +#define HE_DURATION(streams, gi, bps) \ + ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps)))) + +#define HE_DURATION_S(shift, streams, gi, bps) \ + (HE_DURATION(streams, gi, bps) >> shift) + +#define BW_20 0 +#define BW_40 1 +#define BW_80 2 +#define BW_160 3 + +/* + * Define group sort order: HT40 -> SGI -> #streams + */ +#define IEEE80211_MAX_STREAMS 4 +#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ +#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */ + +#define IEEE80211_HE_MAX_STREAMS 8 +#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */ + +#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_HT_STREAM_GROUPS) +#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ + IEEE80211_VHT_STREAM_GROUPS) +#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \ + IEEE80211_HE_STREAM_GROUPS) +#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \ + IEEE80211_VHT_GROUPS_NB + \ + IEEE80211_HE_GROUPS_NB) + +#define IEEE80211_HT_GROUP_0 0 +#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB) +#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB) + +#define MCS_GROUP_RATES 12 + +#define HT_GROUP_IDX(_streams, _sgi, _ht40) \ + IEEE80211_HT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * _ht40 + \ + IEEE80211_MAX_STREAMS * _sgi + \ + _streams - 1 + +#define _MAX(a, b) (((a)>(b))?(a):(b)) + +#define GROUP_SHIFT(duration) \ + _MAX(0, 16 - __builtin_clz(duration)) + +/* MCS rate information for an MCS group */ +#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ + [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \ + MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \ + } \ +} + +#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) + +#define MCS_GROUP(_streams, _sgi, _ht40) \ + __MCS_GROUP(_streams, _sgi, _ht40, \ + MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) + +#define VHT_GROUP_IDX(_streams, _sgi, _bw) \ + (IEEE80211_VHT_GROUP_0 + \ + IEEE80211_MAX_STREAMS * 2 * (_bw) + \ + IEEE80211_MAX_STREAMS * (_sgi) + \ + (_streams) - 1) + +#define BW2VBPS(_bw, r4, r3, r2, r1) \ + (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) + +#define __VHT_GROUP(_streams, _sgi, _bw, _s) \ + [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 234, 117, 54, 26)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 468, 234, 108, 52)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 702, 351, 162, 78)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 936, 468, 216, 104)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1404, 702, 324, 156)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 1872, 936, 432, 208)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2106, 1053, 486, 234)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2340, 1170, 540, 260)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 2808, 1404, 648, 312)), \ + MCS_DURATION_S(_s, _streams, _sgi, \ + BW2VBPS(_bw, 3120, 1560, 720, 346)) \ + } \ +} + +#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ + BW2VBPS(_bw, 243, 117, 54, 26))) + +#define VHT_GROUP(_streams, _sgi, _bw) \ + __VHT_GROUP(_streams, _sgi, _bw, \ + VHT_GROUP_SHIFT(_streams, _sgi, _bw)) + + +#define HE_GROUP_IDX(_streams, _gi, _bw) \ + (IEEE80211_HE_GROUP_0 + \ + IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \ + IEEE80211_HE_MAX_STREAMS * (_gi) + \ + (_streams) - 1) + +#define __HE_GROUP(_streams, _gi, _bw, _s) \ + [HE_GROUP_IDX(_streams, _gi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 1958, 979, 475, 230)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 2937, 1468, 705, 345)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 3916, 1958, 936, 475)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 5875, 2937, 1411, 705)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 7833, 3916, 1872, 936)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \ + HE_DURATION_S(_s, _streams, _gi, \ + BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \ + } \ +} + +#define HE_GROUP_SHIFT(_streams, _gi, _bw) \ + GROUP_SHIFT(HE_DURATION(_streams, _gi, \ + BW2VBPS(_bw, 979, 489, 230, 115))) + +#define HE_GROUP(_streams, _gi, _bw) \ + __HE_GROUP(_streams, _gi, _bw, \ + HE_GROUP_SHIFT(_streams, _gi, _bw)) +struct mcs_group { + u8 shift; + u16 duration[MCS_GROUP_RATES]; +}; + +static const struct mcs_group airtime_mcs_groups[] = { + MCS_GROUP(1, 0, BW_20), + MCS_GROUP(2, 0, BW_20), + MCS_GROUP(3, 0, BW_20), + MCS_GROUP(4, 0, BW_20), + + MCS_GROUP(1, 1, BW_20), + MCS_GROUP(2, 1, BW_20), + MCS_GROUP(3, 1, BW_20), + MCS_GROUP(4, 1, BW_20), + + MCS_GROUP(1, 0, BW_40), + MCS_GROUP(2, 0, BW_40), + MCS_GROUP(3, 0, BW_40), + MCS_GROUP(4, 0, BW_40), + + MCS_GROUP(1, 1, BW_40), + MCS_GROUP(2, 1, BW_40), + MCS_GROUP(3, 1, BW_40), + MCS_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_20), + VHT_GROUP(2, 0, BW_20), + VHT_GROUP(3, 0, BW_20), + VHT_GROUP(4, 0, BW_20), + + VHT_GROUP(1, 1, BW_20), + VHT_GROUP(2, 1, BW_20), + VHT_GROUP(3, 1, BW_20), + VHT_GROUP(4, 1, BW_20), + + VHT_GROUP(1, 0, BW_40), + VHT_GROUP(2, 0, BW_40), + VHT_GROUP(3, 0, BW_40), + VHT_GROUP(4, 0, BW_40), + + VHT_GROUP(1, 1, BW_40), + VHT_GROUP(2, 1, BW_40), + VHT_GROUP(3, 1, BW_40), + VHT_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_80), + VHT_GROUP(2, 0, BW_80), + VHT_GROUP(3, 0, BW_80), + VHT_GROUP(4, 0, BW_80), + + VHT_GROUP(1, 1, BW_80), + VHT_GROUP(2, 1, BW_80), + VHT_GROUP(3, 1, BW_80), + VHT_GROUP(4, 1, BW_80), + + VHT_GROUP(1, 0, BW_160), + VHT_GROUP(2, 0, BW_160), + VHT_GROUP(3, 0, BW_160), + VHT_GROUP(4, 0, BW_160), + + VHT_GROUP(1, 1, BW_160), + VHT_GROUP(2, 1, BW_160), + VHT_GROUP(3, 1, BW_160), + VHT_GROUP(4, 1, BW_160), + + HE_GROUP(1, HE_GI_08, BW_20), + HE_GROUP(2, HE_GI_08, BW_20), + HE_GROUP(3, HE_GI_08, BW_20), + HE_GROUP(4, HE_GI_08, BW_20), + HE_GROUP(5, HE_GI_08, BW_20), + HE_GROUP(6, HE_GI_08, BW_20), + HE_GROUP(7, HE_GI_08, BW_20), + HE_GROUP(8, HE_GI_08, BW_20), + + HE_GROUP(1, HE_GI_16, BW_20), + HE_GROUP(2, HE_GI_16, BW_20), + HE_GROUP(3, HE_GI_16, BW_20), + HE_GROUP(4, HE_GI_16, BW_20), + HE_GROUP(5, HE_GI_16, BW_20), + HE_GROUP(6, HE_GI_16, BW_20), + HE_GROUP(7, HE_GI_16, BW_20), + HE_GROUP(8, HE_GI_16, BW_20), + + HE_GROUP(1, HE_GI_32, BW_20), + HE_GROUP(2, HE_GI_32, BW_20), + HE_GROUP(3, HE_GI_32, BW_20), + HE_GROUP(4, HE_GI_32, BW_20), + HE_GROUP(5, HE_GI_32, BW_20), + HE_GROUP(6, HE_GI_32, BW_20), + HE_GROUP(7, HE_GI_32, BW_20), + HE_GROUP(8, HE_GI_32, BW_20), + + HE_GROUP(1, HE_GI_08, BW_40), + HE_GROUP(2, HE_GI_08, BW_40), + HE_GROUP(3, HE_GI_08, BW_40), + HE_GROUP(4, HE_GI_08, BW_40), + HE_GROUP(5, HE_GI_08, BW_40), + HE_GROUP(6, HE_GI_08, BW_40), + HE_GROUP(7, HE_GI_08, BW_40), + HE_GROUP(8, HE_GI_08, BW_40), + + HE_GROUP(1, HE_GI_16, BW_40), + HE_GROUP(2, HE_GI_16, BW_40), + HE_GROUP(3, HE_GI_16, BW_40), + HE_GROUP(4, HE_GI_16, BW_40), + HE_GROUP(5, HE_GI_16, BW_40), + HE_GROUP(6, HE_GI_16, BW_40), + HE_GROUP(7, HE_GI_16, BW_40), + HE_GROUP(8, HE_GI_16, BW_40), + + HE_GROUP(1, HE_GI_32, BW_40), + HE_GROUP(2, HE_GI_32, BW_40), + HE_GROUP(3, HE_GI_32, BW_40), + HE_GROUP(4, HE_GI_32, BW_40), + HE_GROUP(5, HE_GI_32, BW_40), + HE_GROUP(6, HE_GI_32, BW_40), + HE_GROUP(7, HE_GI_32, BW_40), + HE_GROUP(8, HE_GI_32, BW_40), + + HE_GROUP(1, HE_GI_08, BW_80), + HE_GROUP(2, HE_GI_08, BW_80), + HE_GROUP(3, HE_GI_08, BW_80), + HE_GROUP(4, HE_GI_08, BW_80), + HE_GROUP(5, HE_GI_08, BW_80), + HE_GROUP(6, HE_GI_08, BW_80), + HE_GROUP(7, HE_GI_08, BW_80), + HE_GROUP(8, HE_GI_08, BW_80), + + HE_GROUP(1, HE_GI_16, BW_80), + HE_GROUP(2, HE_GI_16, BW_80), + HE_GROUP(3, HE_GI_16, BW_80), + HE_GROUP(4, HE_GI_16, BW_80), + HE_GROUP(5, HE_GI_16, BW_80), + HE_GROUP(6, HE_GI_16, BW_80), + HE_GROUP(7, HE_GI_16, BW_80), + HE_GROUP(8, HE_GI_16, BW_80), + + HE_GROUP(1, HE_GI_32, BW_80), + HE_GROUP(2, HE_GI_32, BW_80), + HE_GROUP(3, HE_GI_32, BW_80), + HE_GROUP(4, HE_GI_32, BW_80), + HE_GROUP(5, HE_GI_32, BW_80), + HE_GROUP(6, HE_GI_32, BW_80), + HE_GROUP(7, HE_GI_32, BW_80), + HE_GROUP(8, HE_GI_32, BW_80), + + HE_GROUP(1, HE_GI_08, BW_160), + HE_GROUP(2, HE_GI_08, BW_160), + HE_GROUP(3, HE_GI_08, BW_160), + HE_GROUP(4, HE_GI_08, BW_160), + HE_GROUP(5, HE_GI_08, BW_160), + HE_GROUP(6, HE_GI_08, BW_160), + HE_GROUP(7, HE_GI_08, BW_160), + HE_GROUP(8, HE_GI_08, BW_160), + + HE_GROUP(1, HE_GI_16, BW_160), + HE_GROUP(2, HE_GI_16, BW_160), + HE_GROUP(3, HE_GI_16, BW_160), + HE_GROUP(4, HE_GI_16, BW_160), + HE_GROUP(5, HE_GI_16, BW_160), + HE_GROUP(6, HE_GI_16, BW_160), + HE_GROUP(7, HE_GI_16, BW_160), + HE_GROUP(8, HE_GI_16, BW_160), + + HE_GROUP(1, HE_GI_32, BW_160), + HE_GROUP(2, HE_GI_32, BW_160), + HE_GROUP(3, HE_GI_32, BW_160), + HE_GROUP(4, HE_GI_32, BW_160), + HE_GROUP(5, HE_GI_32, BW_160), + HE_GROUP(6, HE_GI_32, BW_160), + HE_GROUP(7, HE_GI_32, BW_160), + HE_GROUP(8, HE_GI_32, BW_160), +}; + +static u32 +ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre, + bool cck, int len) +{ + u32 duration; + + if (cck) { + duration = 144 + 48; /* preamble + PLCP */ + if (short_pre) + duration >>= 1; + + duration += 10; /* SIFS */ + } else { + duration = 20 + 16; /* premable + SIFS */ + } + + len <<= 3; + duration += (len * 10) / bitrate; + + return duration; +} + +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_rate *rate; + bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI; + bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE; + int bw, streams; + int group, idx; + u32 duration; + bool cck; + + switch (status->bw) { + case RATE_INFO_BW_20: + bw = BW_20; + break; + case RATE_INFO_BW_40: + bw = BW_40; + break; + case RATE_INFO_BW_80: + bw = BW_80; + break; + case RATE_INFO_BW_160: + bw = BW_160; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + switch (status->encoding) { + case RX_ENC_LEGACY: + if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ)) + return 0; + + sband = hw->wiphy->bands[status->band]; + if (!sband || status->rate_idx > sband->n_bitrates) + return 0; + + rate = &sband->bitrates[status->rate_idx]; + cck = rate->flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp, + cck, len); + + case RX_ENC_VHT: + streams = status->nss; + idx = status->rate_idx; + group = VHT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HT: + streams = ((status->rate_idx >> 3) & 3) + 1; + idx = status->rate_idx & 7; + group = HT_GROUP_IDX(streams, sgi, bw); + break; + case RX_ENC_HE: + streams = status->nss; + idx = status->rate_idx; + group = HE_GROUP_IDX(streams, status->he_gi, bw); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) || + (status->encoding == RX_ENC_HE && streams > 8))) + return 0; + + duration = airtime_mcs_groups[group].duration[idx]; + duration <<= airtime_mcs_groups[group].shift; + duration *= len; + duration /= AVG_PKT_SIZE; + duration /= 1024; + + duration += 36 + (streams << 2); + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime); + +static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, + struct ieee80211_tx_rate *rate, + u8 band, int len) +{ + struct ieee80211_rx_status stat = { + .band = band, + }; + + if (rate->idx < 0 || !rate->count) + return 0; + + if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_80; + else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + stat.bw = RATE_INFO_BW_40; + else + stat.bw = RATE_INFO_BW_20; + + stat.enc_flags = 0; + if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) + stat.enc_flags |= RX_ENC_FLAG_SHORTPRE; + if (rate->flags & IEEE80211_TX_RC_SHORT_GI) + stat.enc_flags |= RX_ENC_FLAG_SHORT_GI; + + stat.rate_idx = rate->idx; + if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { + stat.encoding = RX_ENC_VHT; + stat.rate_idx = ieee80211_rate_get_vht_mcs(rate); + stat.nss = ieee80211_rate_get_vht_nss(rate); + } else if (rate->flags & IEEE80211_TX_RC_MCS) { + stat.encoding = RX_ENC_HT; + } else { + stat.encoding = RX_ENC_LEGACY; + } + + return ieee80211_calc_rx_airtime(hw, &stat, len); +} + +u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_tx_info *info, + int len) +{ + u32 duration = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) { + struct ieee80211_tx_rate *rate = &info->status.rates[i]; + u32 cur_duration; + + cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, + info->band, len); + if (!cur_duration) + break; + + duration += cur_duration * rate->count; + } + + return duration; +} +EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime); + +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_chanctx_conf *conf; + int rateidx, shift = 0; + bool cck, short_pream; + u32 basic_rates; + u8 band = 0; + u16 rate; + + len += 38; /* Ethernet header length */ + + conf = rcu_dereference(vif->chanctx_conf); + if (conf) { + band = conf->def.chan->band; + shift = ieee80211_chandef_get_shift(&conf->def); + } + + if (pubsta) { + struct sta_info *sta = container_of(pubsta, struct sta_info, + sta); + + return ieee80211_calc_tx_airtime_rate(hw, + &sta->tx_stats.last_rate, + band, len); + } + + if (!conf) + return 0; + + /* No station to get latest rate from, so calculate the worst-case + * duration using the lowest configured basic rate. + */ + sband = hw->wiphy->bands[band]; + + basic_rates = vif->bss_conf.basic_rates; + short_pream = vif->bss_conf.use_short_preamble; + + rateidx = basic_rates ? ffs(basic_rates) - 1 : 0; + rate = sband->bitrates[rateidx].bitrate << shift; + cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len); +} diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 70739e746c13..4fb7f1f12109 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3428,7 +3428,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); + 1, 0x40, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 568b3b276931..ad41d74530c6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -59,6 +59,8 @@ static const struct file_operations name## _ops = { \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); +DEBUGFS_READONLY_FILE(hw_conf, "%x", + local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", @@ -148,6 +150,87 @@ static const struct file_operations aqm_ops = { .llseek = default_llseek, }; +static ssize_t aql_txq_limit_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[400]; + int len = 0; + + len = scnprintf(buf, sizeof(buf), + "AC AQL limit low AQL limit high\n" + "VO %u %u\n" + "VI %u %u\n" + "BE %u %u\n" + "BK %u %u\n", + local->aql_txq_limit_low[IEEE80211_AC_VO], + local->aql_txq_limit_high[IEEE80211_AC_VO], + local->aql_txq_limit_low[IEEE80211_AC_VI], + local->aql_txq_limit_high[IEEE80211_AC_VI], + local->aql_txq_limit_low[IEEE80211_AC_BE], + local->aql_txq_limit_high[IEEE80211_AC_BE], + local->aql_txq_limit_low[IEEE80211_AC_BK], + local->aql_txq_limit_high[IEEE80211_AC_BK]); + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static ssize_t aql_txq_limit_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[100]; + size_t len; + u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; + struct sta_info *sta; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = 0; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = 0; + + if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + q_limit_low_old = local->aql_txq_limit_low[ac]; + q_limit_high_old = local->aql_txq_limit_high[ac]; + + local->aql_txq_limit_low[ac] = q_limit_low; + local->aql_txq_limit_high[ac] = q_limit_high; + + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + /* If a sta has customized queue limits, keep it */ + if (sta->airtime[ac].aql_limit_low == q_limit_low_old && + sta->airtime[ac].aql_limit_high == q_limit_high_old) { + sta->airtime[ac].aql_limit_low = q_limit_low; + sta->airtime[ac].aql_limit_high = q_limit_high; + } + } + mutex_unlock(&local->sta_mtx); + return count; +} + +static const struct file_operations aql_txq_limit_ops = { + .write = aql_txq_limit_write, + .read = aql_txq_limit_read, + .open = simple_open, + .llseek = default_llseek, +}; + static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, @@ -433,6 +516,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); if (local->ops->wake_tx_queue) @@ -441,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local) debugfs_create_u16("airtime_flags", 0600, phyd, &local->airtime_flags); + DEBUGFS_ADD(aql_txq_limit); + debugfs_create_u32("aql_threshold", 0600, + phyd, &local->aql_threshold); + statsd = debugfs_create_dir("statistics", phyd); /* if the dir failed, don't put all the other things into the root! */ diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c8ad20c28c43..0185e6e5e5d1 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -197,10 +197,12 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; - size_t bufsz = 200; + size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s64 deficit[IEEE80211_NUM_ACS]; + u32 q_depth[IEEE80211_NUM_ACS]; + u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; ssize_t rv; int ac; @@ -212,19 +214,22 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; deficit[ac] = sta->airtime[ac].deficit; + q_limit_l[ac] = sta->airtime[ac].aql_limit_low; + q_limit_h[ac] = sta->airtime[ac].aql_limit_high; spin_unlock_bh(&local->active_txq_lock[ac]); + q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" - "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", - rx_airtime, - tx_airtime, - sta->airtime_weight, - deficit[0], - deficit[1], - deficit[2], - deficit[3]); + "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n" + "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" + "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", + rx_airtime, tx_airtime, sta->airtime_weight, + deficit[0], deficit[1], deficit[2], deficit[3], + q_depth[0], q_depth[1], q_depth[2], q_depth[3], + q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], + q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -236,7 +241,25 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; - int ac; + u32 ac, q_limit_l, q_limit_h; + char _buf[100] = {}, *buf = _buf; + + if (count > sizeof(_buf)) + return -EINVAL; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + buf[sizeof(_buf) - 1] = '\0'; + if (sscanf(buf, "queue limit %u %u %u", &ac, &q_limit_l, &q_limit_h) + != 3) + return -EINVAL; + + if (ac >= IEEE80211_NUM_ACS) + return -EINVAL; + + sta->airtime[ac].aql_limit_low = q_limit_l; + sta->airtime[ac].aql_limit_high = q_limit_h; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 0a6ff01c68a9..d40744903fa9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -538,7 +538,6 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; - int err, changed = 0; sdata_assert_lock(sdata); @@ -560,13 +559,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) ifibss->chandef = sdata->csa_chandef; /* generate the beacon */ - err = ieee80211_ibss_csa_beacon(sdata, NULL); - if (err < 0) - return err; - - changed |= err; - - return changed; + return ieee80211_ibss_csa_beacon(sdata, NULL); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 05406e9c05b3..ad15b3be8bb3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1142,6 +1142,10 @@ struct ieee80211_local { u16 schedule_round[IEEE80211_NUM_ACS]; u16 airtime_flags; + u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; + u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; + u32 aql_threshold; + atomic_t aql_total_pending_airtime; const struct ieee80211_ops *ops; @@ -2249,6 +2253,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code); extern const struct ethtool_ops ieee80211_ethtool_ops; +u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + int len); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 2d05c4cfaf6d..6cca0853f183 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -667,8 +667,16 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, for (i = 0; i < IEEE80211_NUM_ACS; i++) { INIT_LIST_HEAD(&local->active_txqs[i]); spin_lock_init(&local->active_txq_lock[i]); + local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; + local->aql_txq_limit_high[i] = + IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } - local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX; + + local->airtime_flags = AIRTIME_USE_TX | + AIRTIME_USE_RX | + AIRTIME_USE_AQL; + local->aql_threshold = IEEE80211_AQL_THRESHOLD; + atomic_set(&local->aql_total_pending_airtime, 0); INIT_LIST_HEAD(&local->chanctx_list); mutex_init(&local->chanctx_mtx); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 54dd8849d1cc..5fa13176036f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3186,15 +3186,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, - struct ieee80211_mgmt *mgmt, size_t len) + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee802_11_elems *elems) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u8 *pos; u16 capab_info, aid; - struct ieee802_11_elems elems; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; @@ -3222,19 +3221,15 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ifmgd->broken_ap = true; } - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); - - if (!elems.supp_rates) { + if (!elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); return false; } ifmgd->aid = aid; ifmgd->tdls_chan_switch_prohibited = - elems.ext_capab && elems.ext_capab_len >= 5 && - (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); + elems->ext_capab && elems->ext_capab_len >= 5 && + (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their @@ -3243,11 +3238,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems.wmm_param) || + if ((assoc_data->wmm && !elems->wmm_param) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.ht_cap_elem || !elems.ht_operation)) || + (!elems->ht_cap_elem || !elems->ht_operation)) || (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation))) { + (!elems->vht_cap_elem || !elems->vht_operation))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3265,8 +3260,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, mgmt->bssid, assoc_data->bss->bssid); if (assoc_data->wmm && - !elems.wmm_param && bss_elems.wmm_param) { - elems.wmm_param = bss_elems.wmm_param; + !elems->wmm_param && bss_elems.wmm_param) { + elems->wmm_param = bss_elems.wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } @@ -3275,27 +3270,27 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ - if (!elems.ht_cap_elem && bss_elems.ht_cap_elem && + if (!elems->ht_cap_elem && bss_elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_cap_elem = bss_elems.ht_cap_elem; + elems->ht_cap_elem = bss_elems.ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } - if (!elems.ht_operation && bss_elems.ht_operation && + if (!elems->ht_operation && bss_elems.ht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems.ht_operation = bss_elems.ht_operation; + elems->ht_operation = bss_elems.ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems.vht_cap_elem && bss_elems.vht_cap_elem && + if (!elems->vht_cap_elem && bss_elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_cap_elem = bss_elems.vht_cap_elem; + elems->vht_cap_elem = bss_elems.vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } - if (!elems.vht_operation && bss_elems.vht_operation && + if (!elems->vht_operation && bss_elems.vht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems.vht_operation = bss_elems.vht_operation; + elems->vht_operation = bss_elems.vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } @@ -3306,7 +3301,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * they should be present here. This is just a safety net. */ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) { + (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; @@ -3314,7 +3309,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems.vht_cap_elem || !elems.vht_operation)) { + (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; @@ -3341,7 +3336,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - (!elems.he_cap || !elems.he_operation)) { + (!elems->he_cap || !elems->he_operation)) { mutex_unlock(&sdata->local->sta_mtx); sdata_info(sdata, "HE AP is missing HE capability/operation\n"); @@ -3350,23 +3345,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } /* Set up internal HT/VHT capabilities */ - if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems.ht_cap_elem, sta); + elems->ht_cap_elem, sta); - if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems.vht_cap_elem, sta); + elems->vht_cap_elem, sta); - if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - elems.he_cap) { + if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, - elems.he_cap, - elems.he_cap_len, + elems->he_cap, + elems->he_cap_len, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3374,14 +3369,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (bss_conf->he_support) { bss_conf->bss_color = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->htc_trig_based_pkt_ext = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = - le32_get_bits(elems.he_operation->he_oper_params, + le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->multi_sta_back_32bit = @@ -3392,12 +3387,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_ACK_EN; - bss_conf->uora_exists = !!elems.uora_element; - if (elems.uora_element) - bss_conf->uora_ocw_range = elems.uora_element[0]; + bss_conf->uora_exists = !!elems->uora_element; + if (elems->uora_element) + bss_conf->uora_ocw_range = elems->uora_element[0]; - ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems.he_operation); - ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems.he_spr); + ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); + ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } @@ -3421,11 +3416,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ - if (elems.opmode_notif && - !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { + if (elems->opmode_notif && + !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; - nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; sta->sta.rx_nss = nss; @@ -3440,7 +3435,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta->sta.mfp = false; } - sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; + sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -3468,9 +3463,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); - } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len, - elems.mu_edca_param_set)) { + } else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3484,11 +3479,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; - if (elems.max_idle_period_ie) { + if (elems->max_idle_period_ie) { bss_conf->max_idle_period = - le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = - !!(elems.max_idle_period_ie->idle_options & + !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); changed |= BSS_CHANGED_KEEP_ALIVE; } else { @@ -3598,7 +3593,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { + if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, bss); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ee86c3333999..86bc469a28bc 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -70,7 +70,7 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) } /* return current EMWA throughput */ -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg) { int usecs; @@ -79,13 +79,13 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) usecs = 1000000; /* reset thr. below 10% success */ - if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100)) + if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100)) return 0; - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); else - return MINSTREL_TRUNC(100000 * (prob_ewma / usecs)); + return MINSTREL_TRUNC(100000 * (prob_avg / usecs)); } /* find & sort topmost throughput rates */ @@ -98,8 +98,8 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) for (j = MAX_THR_RATES; j > 0; --j) { tmp_mrs = &mi->r[tp_list[j - 1]].stats; - if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) <= - minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma)) + if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <= + minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg)) break; } @@ -157,20 +157,24 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * Recalculate statistics and counters of a given rate */ void -minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) +minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); - if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = cur_prob; + if (mp->new_avg) { + minstrel_filter_avg_add(&mrs->prob_avg, + &mrs->prob_avg_1, cur_prob); + } else if (unlikely(!mrs->att_hist)) { + mrs->prob_avg = cur_prob; } else { /*update exponential weighted moving avarage */ - mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - cur_prob, - EWMA_LEVEL); + mrs->prob_avg = minstrel_ewma(mrs->prob_avg, + cur_prob, + EWMA_LEVEL); } mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; @@ -200,12 +204,12 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; /* Update statistics of success probability per rate */ - minstrel_calc_rate_stats(mrs); + minstrel_calc_rate_stats(mp, mrs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || - mrs->prob_ewma < MINSTREL_FRAC(10, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || + mrs->prob_avg < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -225,14 +229,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is chosen as max_prob_rate */ - if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) { - tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma); + if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) { + tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg); tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], - tmp_mrs->prob_ewma); + tmp_mrs->prob_avg); if (tmp_cur_tp >= tmp_prob_tp) tmp_prob_rate = i; } else { - if (mrs->prob_ewma >= tmp_mrs->prob_ewma) + if (mrs->prob_avg >= tmp_mrs->prob_avg) tmp_prob_rate = i; } } @@ -290,7 +294,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->sample_deferred--; if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval * HZ) / 1000)) + mp->update_interval / (mp->new_avg ? 2 : 1))) minstrel_update_stats(mp, mi); } @@ -422,7 +426,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -573,7 +577,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 51d8b2c846e7..dbb43bcd3c45 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -19,6 +19,21 @@ #define MAX_THR_RATES 4 /* + * Coefficients for moving average with noise filter (period=16), + * scaled by 10 bits + * + * a1 = exp(-pi * sqrt(2) / period) + * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) + * coeff3 = -sqr(a1) + * coeff1 = 1 - coeff2 - coeff3 + */ +#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ + MINSTREL_AVG_COEFF2 - \ + MINSTREL_AVG_COEFF3) +#define MINSTREL_AVG_COEFF2 0x00001499 +#define MINSTREL_AVG_COEFF3 -0x0000092e + +/* * Perform EWMA (Exponentially Weighted Moving Average) calculation */ static inline int @@ -32,6 +47,37 @@ minstrel_ewma(int old, int new, int weight) return old + incr; } +static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) +{ + s32 out_1 = *prev_1; + s32 out_2 = *prev_2; + s32 val; + + if (!in) + in += 1; + + if (!out_1) { + val = out_1 = in; + goto out; + } + + val = MINSTREL_AVG_COEFF1 * in; + val += MINSTREL_AVG_COEFF2 * out_1; + val += MINSTREL_AVG_COEFF3 * out_2; + val >>= MINSTREL_SCALE; + + if (val > 1 << MINSTREL_SCALE) + val = 1 << MINSTREL_SCALE; + if (val < 0) + val = 1; + +out: + *prev_2 = out_1; + *prev_1 = val; + + return val; +} + struct minstrel_rate_stats { /* current / last sampling period attempts/success counters */ u16 attempts, last_attempts; @@ -40,8 +86,9 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u32 att_hist, succ_hist; - /* prob_ewma - exponential weighted moving average of prob */ - u16 prob_ewma; + /* prob_avg - moving average of prob */ + u16 prob_avg; + u16 prob_avg_1; /* maximum retry counts */ u8 retry_count; @@ -95,6 +142,7 @@ struct minstrel_sta_info { struct minstrel_priv { struct ieee80211_hw *hw; bool has_mrr; + bool new_avg; u32 sample_switch; unsigned int cw_min; unsigned int cw_max; @@ -126,8 +174,9 @@ extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); /* Recalculate success probabilities and counters for a given rate using EWMA */ -void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); +void minstrel_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs); +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg); /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index c8afd85b51a0..9b8e0daeb7bb 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -90,8 +90,8 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%6u ", mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -147,8 +147,8 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) p += sprintf(p, "%u,",mr->perfect_tx_time); tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 0ef2633349b5..694a31978a04 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -346,12 +346,12 @@ minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma) + int prob_avg) { unsigned int nsecs = 0; /* do not account throughput if sucess prob is below 10% */ - if (prob_ewma < MINSTREL_FRAC(10, 100)) + if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (group != MINSTREL_CCK_GROUP) @@ -365,11 +365,11 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ - if (prob_ewma > MINSTREL_FRAC(90, 100)) + if (prob_avg > MINSTREL_FRAC(90, 100)) return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) / nsecs)); else - return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs)); + return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs)); } /* @@ -389,13 +389,13 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, cur_group = index / MCS_GROUP_RATES; cur_idx = index % MCS_GROUP_RATES; - cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma; + cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || @@ -432,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from @@ -444,11 +444,11 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; - if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { + if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, - mrs->prob_ewma); + mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; @@ -458,9 +458,9 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { - if (mrs->prob_ewma > tmp_prob) + if (mrs->prob_avg > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > max_gpr_prob) + if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -482,12 +482,12 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { @@ -518,7 +518,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma; + tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { @@ -623,7 +623,7 @@ minstrel_ht_rate_sample_switch(struct minstrel_priv *mp, * If that fails, look again for a rate that is at least as fast */ mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); - faster_rate = mrs->prob_ewma > MINSTREL_FRAC(75, 100); + faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100); minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate); if (!n_rates && faster_rate) minstrel_ht_find_probe_rates(mi, rates, &n_rates, false); @@ -737,8 +737,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mrs = &mg->rates[i]; mrs->retry_updated = false; - minstrel_calc_rate_stats(mrs); - cur_prob = mrs->prob_ewma; + minstrel_calc_rate_stats(mp, mrs); + cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; @@ -773,6 +773,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, /* try to sample all available rates during each interval */ mi->sample_count *= 8; + if (mp->new_avg) + mi->sample_count /= 2; if (sample) minstrel_ht_rate_sample_switch(mp, mi); @@ -889,6 +891,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL; struct minstrel_priv *mp = priv; + u32 update_interval = mp->update_interval / 2; bool last, update = false; bool sample_status = false; int i; @@ -943,6 +946,10 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, switch (mi->sample_mode) { case MINSTREL_SAMPLE_IDLE: + if (mp->new_avg && + (mp->hw->max_rates > 1 || + mi->total_packets_cur < SAMPLE_SWITCH_THR)) + update_interval /= 2; break; case MINSTREL_SAMPLE_ACTIVE: @@ -970,23 +977,20 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, */ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && - MINSTREL_FRAC(rate->success, rate->attempts) < - MINSTREL_FRAC(20, 100)) { + rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && - MINSTREL_FRAC(rate2->success, rate2->attempts) < - MINSTREL_FRAC(20, 100)) { + rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } - if (time_after(jiffies, mi->last_stats_update + - (mp->update_interval / 2 * HZ) / 1000)) { + if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi, true); } @@ -1008,7 +1012,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) { + if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; @@ -1065,7 +1069,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { + if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; @@ -1099,11 +1103,11 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, } static inline int -minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate) +minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = rate / MCS_GROUP_RATES; rate %= MCS_GROUP_RATES; - return mi->groups[group].rates[rate].prob_ewma; + return mi->groups[group].rates[rate].prob_avg; } static int @@ -1115,7 +1119,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ - if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) + if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; @@ -1138,7 +1142,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || - (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < + (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; @@ -1243,7 +1247,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * rate, to avoid wasting airtime. */ sample_dur = minstrel_get_duration(sample_idx); - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) return -1; @@ -1666,7 +1670,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) mp->has_mrr = true; mp->hw = hw; - mp->update_interval = 100; + mp->update_interval = HZ / 10; + mp->new_avg = true; #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; @@ -1674,6 +1679,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) &mp->fixed_rate_idx); debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir, &mp->sample_switch); + debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir, + &mp->new_avg); #endif minstrel_ht_init_cck_rates(mp); @@ -1698,7 +1705,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) i = mi->max_tp_rate[0] / MCS_GROUP_RATES; j = mi->max_tp_rate[0] % MCS_GROUP_RATES; - prob = mi->groups[i].rates[j].prob_ewma; + prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index f938701e7ab7..53ea3c29debf 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -119,6 +119,6 @@ struct minstrel_ht_sta_priv { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, - int prob_ewma); + int prob_avg); #endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5a6e9f3edc04..bebb71917742 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -98,8 +98,8 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%6u ", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " @@ -243,8 +243,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%u,", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg); + eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8d3a2389b055..8eafd81e97b4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -210,6 +210,20 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, return NULL; } +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr) +{ + struct rhlist_head *tmp; + struct sta_info *sta; + + for_each_sta_info(local, sta_addr, sta, tmp) { + if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) + return sta; + } + + return NULL; +} + struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { @@ -396,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; + atomic_set(&sta->airtime[i].aql_tx_pending, 0); + sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; + sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) @@ -1893,6 +1910,41 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, } EXPORT_SYMBOL(ieee80211_sta_register_airtime); +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed) +{ + int tx_pending; + + if (!tx_completed) { + if (sta) + atomic_add(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + + atomic_add(tx_airtime, &local->aql_total_pending_airtime); + return; + } + + if (sta) { + tx_pending = atomic_sub_return(tx_airtime, + &sta->airtime[ac].aql_tx_pending); + if (WARN_ONCE(tx_pending < 0, + "STA %pM AC %d txq pending airtime underflow: %u, %u", + sta->addr, ac, tx_pending, tx_airtime)) + atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, + tx_pending, 0); + } + + tx_pending = atomic_sub_return(tx_airtime, + &local->aql_total_pending_airtime); + if (WARN_ONCE(tx_pending < 0, + "Device %s AC %d pending airtime underflow: %u, %u", + wiphy_name(local->hw.wiphy), ac, tx_pending, + tx_airtime)) + atomic_cmpxchg(&local->aql_total_pending_airtime, + tx_pending, 0); +} + int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 369c2dddce52..ad5d8a4ae56d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -127,13 +127,21 @@ enum ieee80211_agg_stop_reason { /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */ #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) +#define AIRTIME_USE_AQL BIT(2) struct airtime_info { u64 rx_airtime; u64 tx_airtime; s64 deficit; + atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ + u32 aql_limit_low; + u32 aql_limit_high; }; +void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, + struct sta_info *sta, u8 ac, + u16 tx_airtime, bool tx_completed); + struct sta_info; /** @@ -725,6 +733,10 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); +/* user must hold sta_mtx or be in RCU critical section */ +struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, + const u8 *sta_addr, const u8 *vif_addr); + #define for_each_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ sta_info_hash_lookup(local, _addr), hash_node) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ab8ba5835ca0..b720feaf9a74 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -670,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; + if (tx_time_est) { + struct sta_info *sta; + + rcu_read_lock(); + + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + rcu_read_unlock(); + } + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; @@ -877,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; + u16 tx_time_est; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -986,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_sta_register_airtime(&sta->sta, tid, info->status.tx_time, 0); + if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) { + /* Do this here to avoid the expensive lookup of the sta + * in ieee80211_report_used_skb(). + */ + ieee80211_sta_update_pending_airtime(local, sta, + skb_get_queue_mapping(skb), + tx_time_est, + true); + ieee80211_info_set_tx_time_est(info, 0); + } + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->status_stats.lost_packets) @@ -1030,7 +1056,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, I802_DEBUG_INC(local->dot11FailedCount); } - if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && + if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) && + ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { @@ -1073,19 +1100,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) .skb = skb, .info = IEEE80211_SKB_CB(skb), }; - struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); - for_each_sta_info(local, hdr->addr1, sta, tmp) { - /* skip wrong virtual interface */ - if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) - continue; - + sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); + if (sta) status.sta = &sta->sta; - break; - } __ieee80211_tx_status(hw, &status); rcu_read_unlock(); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1fa422782905..b696b9136f4c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1617,7 +1617,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { @@ -1679,7 +1679,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; - control.sta = sta; + control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); @@ -1698,7 +1698,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; - struct ieee80211_sta *pubsta; struct sk_buff *skb; bool result = true; __le16 fc; @@ -1713,11 +1712,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, if (sta && !sta->uploaded) sta = NULL; - if (sta) - pubsta = &sta->sta; - else - pubsta = NULL; - switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { @@ -1744,8 +1738,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } - result = ieee80211_tx_frags(local, vif, pubsta, skbs, - txpending); + result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); ieee80211_tpt_led_trig_tx(local, fc, led_len); @@ -2277,6 +2270,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * isn't always enough to find the interface to use; for proper * VLAN/WDS support we will need a different mechanism (which * likely isn't going to be monitor interfaces). + * + * This is necessary, for example, for old hostapd versions that + * don't use nl80211-based management TX/RX. */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -2424,6 +2420,33 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_store_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, + u32 *info_flags) +{ + struct sk_buff *ack_skb = skb_clone_sk(skb); + u16 info_id = 0; + + if (ack_skb) { + unsigned long flags; + int id; + + spin_lock_irqsave(&local->ack_status_lock, flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x40, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (id >= 0) { + info_id = id; + *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + } else { + kfree_skb(ack_skb); + } + } + + return info_id; +} + /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for @@ -2717,26 +2740,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *ack_skb = skb_clone_sk(skb); - - if (ack_skb) { - unsigned long flags; - int id; - - spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (id >= 0) { - info_id = id; - info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else { - kfree_skb(ack_skb); - } - } - } + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags); /* * If the skb is shared we need to obtain our own copy. @@ -3529,7 +3534,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); - ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return true; } @@ -3549,6 +3554,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, WARN_ON_ONCE(softirq_count() == 0); + if (!ieee80211_txq_airtime_check(hw, txq)) + return NULL; + begin: spin_lock_bh(&fq->lock); @@ -3659,6 +3667,21 @@ begin: } IEEE80211_SKB_CB(skb)->control.vif = vif; + + if (local->airtime_flags & AIRTIME_USE_AQL) { + u32 airtime; + + airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, + skb->len); + if (airtime) { + airtime = ieee80211_info_set_tx_time_est(info, airtime); + ieee80211_sta_update_pending_airtime(local, tx.sta, + txq->ac, + airtime, + false); + } + } + return skb; out: @@ -3672,7 +3695,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_txq *ret = NULL; - struct txq_info *txqi = NULL; + struct txq_info *txqi = NULL, *head = NULL; + bool found_eligible_txq = false; spin_lock_bh(&local->active_txq_lock[ac]); @@ -3683,13 +3707,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) if (!txqi) goto out; + if (txqi == head) { + if (!found_eligible_txq) + goto out; + else + found_eligible_txq = false; + } + + if (!head) + head = txqi; + if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); + struct sta_info, sta); + bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); + s64 deficit = sta->airtime[txqi->txq.ac].deficit; + + if (aql_check) + found_eligible_txq = true; - if (sta->airtime[txqi->txq.ac].deficit < 0) { + if (deficit < 0) sta->airtime[txqi->txq.ac].deficit += sta->airtime_weight; + + if (deficit < 0 || !aql_check) { list_move_tail(&txqi->schedule_order, &local->active_txqs[txqi->txq.ac]); goto begin; @@ -3743,6 +3784,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, } EXPORT_SYMBOL(__ieee80211_schedule_txq); +bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct sta_info *sta; + struct ieee80211_local *local = hw_to_local(hw); + + if (!(local->airtime_flags & AIRTIME_USE_AQL)) + return true; + + if (!txq->sta) + return true; + + sta = container_of(txq->sta, struct sta_info, sta); + if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_low) + return true; + + if (atomic_read(&local->aql_total_pending_airtime) < + local->aql_threshold && + atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < + sta->airtime[txq->ac].aql_limit_high) + return true; + + return false; +} +EXPORT_SYMBOL(ieee80211_txq_airtime_check); + bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4fc075b612fe..5e9b2eb24349 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -120,7 +120,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o -nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_offload.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5d5bdf450091..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + INIT_LIST_HEAD(&sublist); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + ret = nf_hook_slow(skb, state, e, 0); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + /* Put passed packets back on main list */ + list_splice(&sublist, head); +} +EXPORT_SYMBOL(nf_hook_slow_list); + /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 063df74b4647..1abd6f0dc227 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } #ifndef IP_SET_BITMAP_STORED_TIMEOUT -static inline bool +static bool mtype_is_filled(const struct mtype_elem *x) { return true; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 11ff9d4a7006..abe8f77d7d23 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem { u16 id; }; -static inline u32 +static u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; @@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, size_t dsize) { @@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, htonl(map->first_ip + id * map->hosts)); } -static inline int +static int bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, return true; } +static u32 +range_to_mask(u32 from, u32 to, u8 *bits) +{ + u32 mask = 0xFFFFFFFE; + + *bits = 32; + while (--(*bits) > 0 && mask && (to & mask) != from) + mask <<= 1; + + return mask; +} + static int bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1d4e63326e68..b618713297da 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -65,7 +65,7 @@ struct bitmap_ipmac_elem { unsigned char filled; } __aligned(__alignof__(u64)); -static inline u32 +static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; @@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { @@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return -EAGAIN; } -static inline int +static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; @@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, @@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, return 0; } -static inline int +static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { @@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, return IPSET_ADD_STORE_PLAIN_TIMEOUT; } -static inline int +static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { @@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } -static inline int +static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 704a0dda1609..23d6095cb196 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -46,7 +46,7 @@ struct bitmap_port_adt_elem { u16 id; }; -static inline u16 +static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; @@ -54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port) /* Common functions */ -static inline int +static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { @@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, htons(map->first_port + id)); } -static inline int +static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } +static bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} + static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d73d1828216a..169e0a04f814 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,7 +35,7 @@ struct ip_set_net { static unsigned int ip_set_net_id __read_mostly; -static inline struct ip_set_net *ip_set_pernet(struct net *net) +static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } @@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); * serialized by ip_set_type_mutex. */ -static inline void +static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } -static inline void +static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); @@ -277,7 +277,7 @@ ip_set_free(void *members) } EXPORT_SYMBOL_GPL(ip_set_free); -static inline bool +static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; @@ -327,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +static u32 +ip_set_timeout_get(const unsigned long *timeout) +{ + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; +} + +static char * +ip_set_comment_uget(struct nlattr *tb) +{ + return nla_data(tb); +} + +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. + */ +void +ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext) +{ + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); + size_t len = ext->comment ? strlen(ext->comment) : 0; + + if (unlikely(c)) { + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); + } + if (!len) + return; + if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) + len = IPSET_MAX_COMMENT_SIZE; + c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) + return; + strlcpy(c->str, ext->comment, len + 1); + set->ext_size += sizeof(*c) + strlen(c->str) + 1; + rcu_assign_pointer(comment->c, c); +} +EXPORT_SYMBOL_GPL(ip_set_init_comment); + +/* Used only when dumping a set, protected by rcu_read_lock() */ +static int +ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) +{ + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); + + if (!c) + return 0; + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); +} + +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. + */ +static void +ip_set_comment_free(struct ip_set *set, void *ptr) +{ + struct ip_set_comment *comment = ptr; + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) + return; + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); +} + typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ @@ -353,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), - .destroy = (destroyer) ip_set_comment_free, + .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); -static inline bool +static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? @@ -448,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +static u64 +ip_set_get_bytes(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->bytes); +} + +static u64 +ip_set_get_packets(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->packets); +} + +static bool +ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) +{ + return nla_put_net64(skb, IPSET_ATTR_BYTES, + cpu_to_be64(ip_set_get_bytes(counter)), + IPSET_ATTR_PAD) || + nla_put_net64(skb, IPSET_ATTR_PACKETS, + cpu_to_be64(ip_set_get_packets(counter)), + IPSET_ATTR_PAD); +} + +static bool +ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) +{ + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask), + IPSET_ATTR_PAD)) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); +} + int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) @@ -473,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +static bool +ip_set_match_counter(u64 counter, u64 match, u8 op) +{ + switch (op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == match; + case IPSET_COUNTER_NE: + return counter != match; + case IPSET_COUNTER_LT: + return counter < match; + case IPSET_COUNTER_GT: + return counter > match; + } + return false; +} + +static void +ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) +{ + atomic64_add((long long)bytes, &(counter)->bytes); +} + +static void +ip_set_add_packets(u64 packets, struct ip_set_counter *counter) +{ + atomic64_add((long long)packets, &(counter)->packets); +} + +static void +ip_set_update_counter(struct ip_set_counter *counter, + const struct ip_set_ext *ext, u32 flags) +{ + if (ext->packets != ULLONG_MAX && + !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { + ip_set_add_bytes(ext->bytes, counter); + ip_set_add_packets(ext->packets, counter); + } +} + +static void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbinfo = *skbinfo; +} + bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) @@ -508,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions); * The set behind an index may change by swapping only, from userspace. */ -static inline void +static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -516,7 +682,7 @@ __ip_set_get(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -static inline void +static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -528,7 +694,7 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -543,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set) * so it can't be destroyed (or changed) under our foot. */ -static inline struct ip_set * +static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; @@ -672,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * */ -static inline void +static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; @@ -1255,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) +int +ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) +{ + u32 cadt_flags = 0; + + if (SET_WITH_TIMEOUT(set)) + if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(set->timeout)))) + return -EMSGSIZE; + if (SET_WITH_COUNTER(set)) + cadt_flags |= IPSET_FLAG_WITH_COUNTERS; + if (SET_WITH_COMMENT(set)) + cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; + + if (!cadt_flags) + return 0; + return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); +} +EXPORT_SYMBOL_GPL(ip_set_put_flags); + static int ip_set_dump_done(struct netlink_callback *cb) { diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2b8f959574b4..36615eb3eae1 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); #endif - -bool -ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) -{ - bool ret; - u8 proto; - - switch (pf) { - case NFPROTO_IPV4: - ret = ip_set_get_ip4_port(skb, src, port, &proto); - break; - case NFPROTO_IPV6: - ret = ip_set_get_ip6_port(skb, src, port, &proto); - break; - default: - return false; - } - if (!ret) - return ret; - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - return true; - default: - return false; - } -} -EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index d098d87bc331..7480ce55b5c8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -39,7 +39,7 @@ #ifdef IP_SET_HASH_WITH_MULTI #define AHASH_MAX(h) ((h)->ahash_max) -static inline u8 +static u8 tune_ahash_max(u8 curr, u32 multi) { u32 n; @@ -909,7 +909,7 @@ out: return ret; } -static inline int +static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f4432d9fcad0..5d6d68eaf6a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -44,7 +44,7 @@ struct hash_ip4_elem { /* Common functions */ -static inline bool +static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) @@ -63,7 +63,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; @@ -171,7 +171,7 @@ struct hash_ip6_elem { /* Common functions */ -static inline bool +static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) @@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline void +static void hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip6_netmask(ip, prefix); @@ -196,7 +196,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 4ce563eb927d..eceb7bc4a93a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -47,7 +47,7 @@ struct hash_ipmac4_elem { /* Common functions */ -static inline bool +static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) @@ -67,7 +67,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { @@ -154,7 +154,7 @@ struct hash_ipmac6_elem { /* Common functions */ -static inline bool +static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) @@ -175,7 +175,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7a1734aad0c5..aba1df617d6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -42,7 +42,7 @@ struct hash_ipmark4_elem { /* Common functions */ -static inline bool +static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) @@ -64,7 +64,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { @@ -165,7 +165,7 @@ struct hash_ipmark6_elem { /* Common functions */ -static inline bool +static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) @@ -187,7 +187,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 32e240658334..1ff228717e29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -47,7 +47,7 @@ struct hash_ipport4_elem { /* Common functions */ -static inline bool +static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) @@ -71,7 +71,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { @@ -202,7 +202,7 @@ struct hash_ipport6_elem { /* Common functions */ -static inline bool +static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) @@ -226,7 +226,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 15d419353179..fa88afd812fa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -46,7 +46,7 @@ struct hash_ipportip4_elem { u8 padding; }; -static inline bool +static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) @@ -72,7 +72,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { @@ -210,7 +210,7 @@ struct hash_ipportip6_elem { /* Common functions */ -static inline bool +static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) @@ -236,7 +236,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7a4d7afd4121..eef6ecfcb409 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -59,7 +59,7 @@ struct hash_ipportnet4_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) @@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); @@ -116,7 +116,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { @@ -308,7 +308,7 @@ struct hash_ipportnet6_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) @@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); @@ -365,7 +365,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index d94c585d33c5..0b61593165ef 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -37,7 +37,7 @@ struct hash_mac4_elem { /* Common functions */ -static inline bool +static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) @@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, return ether_addr_equal(e1->ether, e2->ether); } -static inline bool +static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) @@ -56,7 +56,7 @@ nla_put_failure: return true; } -static inline void +static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 3d932de0ad29..136cf0781d3a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -47,7 +47,7 @@ struct hash_net4_elem { /* Common functions */ -static inline bool +static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) @@ -56,25 +56,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -97,7 +97,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { @@ -212,7 +212,7 @@ struct hash_net6_elem { /* Common functions */ -static inline bool +static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) @@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -262,7 +262,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 87b29f971226..be5e95a0d876 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -25,7 +25,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -57,12 +58,13 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) @@ -71,28 +73,30 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -103,7 +107,8 @@ static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -119,7 +124,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { @@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); @@ -280,12 +287,13 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) @@ -294,28 +302,30 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -326,7 +336,8 @@ static bool hash_netiface6_data_list(struct sk_buff *skb, const struct hash_netiface6_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -342,7 +353,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { @@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 4398322fad59..da4ef910b12d 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -52,7 +52,7 @@ struct hash_netnet4_elem { /* Common functions */ -static inline bool +static bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) @@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -117,7 +117,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { @@ -282,7 +282,7 @@ struct hash_netnet6_elem { /* Common functions */ -static inline bool +static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) @@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -348,7 +348,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 799f2272cc65..34448df80fb9 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -57,7 +57,7 @@ struct hash_netport4_elem { /* Common functions */ -static inline bool +static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) @@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -112,7 +112,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { @@ -270,7 +270,7 @@ struct hash_netport6_elem { /* Common functions */ -static inline bool +static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) @@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -325,7 +325,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..934c1712cba8 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -56,7 +56,7 @@ struct hash_netportnet4_elem { /* Common functions */ -static inline bool +static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) @@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { @@ -126,7 +126,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { @@ -331,7 +331,7 @@ struct hash_netportnet6_elem { /* Common functions */ -static inline bool +static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) @@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { @@ -402,7 +402,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 67ac50104e6f..cd747c0962fd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu) kfree(e); } -static inline void +static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; @@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e) call_rcu(&e->rcu, __list_set_del_rcu); } -static inline void +static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 8b80ab794a92..512259f579d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2402,18 +2402,22 @@ estimator_fail: return -ENOMEM; } -static void __net_exit __ip_vs_cleanup(struct net *net) +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(ipvs); - ip_vs_app_net_cleanup(ipvs); - ip_vs_protocol_net_cleanup(ipvs); - ip_vs_control_net_cleanup(ipvs); - ip_vs_estimator_net_cleanup(ipvs); - IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); - net->ipvs = NULL; + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } } static int __net_init __ip_vs_dev_init(struct net *net) @@ -2429,27 +2433,32 @@ hook_fail: return ret; } -static void __net_exit __ip_vs_dev_cleanup(struct net *net) +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ - smp_wmb(); - ip_vs_sync_net_cleanup(ipvs); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, - .exit = __ip_vs_cleanup, + .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .init = __ip_vs_dev_init, - .exit = __ip_vs_dev_cleanup, + .exit_batch = __ip_vs_dev_cleanup_batch, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3cccc88ef817..3be7398901e0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) /* * Delete service by {netns} in the service table. - * Called by __ip_vs_cleanup() + * Called by __ip_vs_batch_cleanup() */ -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) +void ip_vs_service_nets_cleanup(struct list_head *net_list) { + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(ipvs, true); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index 78b074cd5464..c03066fdd5ca 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -5,7 +5,7 @@ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active - * connections , will keep all conections to the node with the highest weight + * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. * Note that this scheduler might not be suitable for UDP because it only uses diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 888d3068a492..b1e300f8881b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5cd610b547e0..0af1898af2b8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -573,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -1417,7 +1416,6 @@ void nf_conntrack_free(struct nf_conn *ct) WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); - nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6fba74b5aaf7..7956c9f19899 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -30,6 +30,7 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) +#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) enum retry_state { STATE_CONGESTED, @@ -39,11 +40,11 @@ enum retry_state { static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) { - struct nf_conn *refs[16]; + struct nf_conn *refs[ECACHE_STACK_ALLOC]; + enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int evicted = 0; - enum retry_state ret = STATE_DONE; spin_lock(&pcpu->lock); @@ -54,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) if (!nf_ct_is_confirmed(ct)) continue; + /* This ecache access is safe because the ct is on the + * pcpu dying list and we hold the spinlock -- the entry + * cannot be free'd until after the lock is released. + * + * This is true even if ct has a refcount of 0: the + * cpu that is about to free the entry must remove it + * from the dying list and needs the lock to do so. + */ e = nf_ct_ecache_find(ct); if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; + /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means + * the worker owns this entry: the ct will remain valid + * until the worker puts its ct reference. + */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; @@ -189,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) if (notify == NULL) goto out_unlock; + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) + goto out_unlock; + e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; - /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index d4ed1e197921..c24e5b64b00c 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -34,21 +34,24 @@ void nf_ct_ext_destroy(struct nf_conn *ct) t->destroy(ct); rcu_read_unlock(); } + + kfree(ct->ext); } EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext *old, *new; struct nf_ct_ext_type *t; + struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. */ WARN_ON(nf_ct_is_confirmed(ct)); - old = ct->ext; - if (old) { + if (ct->ext) { + const struct nf_ct_ext *old = ct->ext; + if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; @@ -68,22 +71,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); - kmemleak_not_leak(old); - new = __krealloc(old, alloc, gfp); + new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!old) { + if (!ct->ext) memset(new->offset, 0, sizeof(new->offset)); - ct->ext = new; - } else if (new != old) { - kfree_rcu(old, rcu); - rcu_assign_pointer(ct->ext, new); - } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); + + ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e2d13cd18875..d8d33ef52ce0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -506,9 +506,45 @@ nla_put_failure: return -1; } +/* all these functions access ct->ext. Caller must either hold a reference + * on ct or prevent its deletion by holding either the bucket spinlock or + * pcpu dying list lock. + */ +static int ctnetlink_dump_extinfo(struct sk_buff *skb, + struct nf_conn *ct, u32 type) +{ + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + return -1; + + return 0; +} + +static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) +{ + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0) + return -1; + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && + (ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0)) + return -1; + + return 0; +} + static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct) + struct nf_conn *ct, bool extinfo) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; @@ -552,23 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secctx(skb, ct) < 0 || - ctnetlink_dump_labels(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0 || - ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || - ctnetlink_dump_ct_synproxy(skb, ct) < 0) + if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; - - if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -953,13 +975,11 @@ restart: if (!ctnetlink_filter_match(ct, cb->data)) continue; - rcu_read_lock(); res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, true); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1364,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - rcu_read_lock(); err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct); - rcu_read_unlock(); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); nf_ct_put(ct); if (err <= 0) goto free; @@ -1429,12 +1447,18 @@ restart: continue; cb->args[1] = 0; } - rcu_read_lock(); + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, dying ? true : false); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 097deba7441a..c2e3dff773bc 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 128245efe84a..9889d52eda82 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -14,24 +14,15 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> -struct flow_offload_entry { - struct flow_offload flow; - struct nf_conn *ct; - struct rcu_head rcu_head; -}; - static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void -flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, - struct nf_flow_route *route, +flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; - struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; - struct dst_entry *dst = route->tuple[dir].dst; + struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; @@ -39,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; - ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; - ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -52,37 +41,24 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->l4proto = ctt->dst.protonum; ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; - - ft->iifidx = other_dst->dev->ifindex; - ft->dst_cache = dst; } -struct flow_offload * -flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { - struct flow_offload_entry *entry; struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || !atomic_inc_not_zero(&ct->ct_general.use))) return NULL; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) goto err_ct_refcnt; - flow = &entry->flow; + flow->ct = ct; - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) - goto err_dst_cache_original; - - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) - goto err_dst_cache_reply; - - entry->ct = ct; - - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) flow->flags |= FLOW_OFFLOAD_SNAT; @@ -91,10 +67,6 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) return flow; -err_dst_cache_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); -err_dst_cache_original: - kfree(entry); err_ct_refcnt: nf_ct_put(ct); @@ -102,6 +74,56 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static int flow_offload_fill_route(struct flow_offload *flow, + const struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; + struct dst_entry *dst = route->tuple[dir].dst; + + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + flow_tuple->mtu = ip6_dst_mtu_forward(dst); + break; + } + + flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->dst_cache = dst; + + return 0; +} + +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route) +{ + int err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + if (err < 0) + return err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); + if (err < 0) + goto err_route_reply; + + flow->type = NF_FLOW_OFFLOAD_ROUTE; + + return 0; + +err_route_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + + return err; +} +EXPORT_SYMBOL_GPL(flow_offload_route_init); + static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->state = TCP_CONNTRACK_ESTABLISHED; @@ -150,17 +172,25 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) flow_offload_fixup_ct_timeout(ct); } -void flow_offload_free(struct flow_offload *flow) +static void flow_offload_route_release(struct flow_offload *flow) { - struct flow_offload_entry *e; - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); - e = container_of(flow, struct flow_offload_entry, flow); +} + +void flow_offload_free(struct flow_offload *flow) +{ + switch (flow->type) { + case NF_FLOW_OFFLOAD_ROUTE: + flow_offload_route_release(flow); + break; + default: + break; + } if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(e->ct, 0, 0); - nf_ct_put(e->ct); - kfree_rcu(e, rcu_head); + nf_ct_delete(flow->ct, 0, 0); + nf_ct_put(flow->ct); + kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); @@ -220,6 +250,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } + if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD) + nf_flow_offload_add(flow_table, flow); + return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -232,8 +265,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { - struct flow_offload_entry *e; - rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); @@ -241,25 +272,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - e = container_of(flow, struct flow_offload_entry, flow); - clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(e->ct); + flow_offload_fixup_ct(flow->ct); else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) - flow_offload_fixup_ct_timeout(e->ct); + flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { - struct flow_offload_entry *e; - flow->flags |= FLOW_OFFLOAD_TEARDOWN; - e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct_state(e->ct); + flow_offload_fixup_ct_state(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -269,7 +296,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table, { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; - struct flow_offload_entry *e; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, @@ -282,8 +308,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) return NULL; - e = container_of(flow, struct flow_offload_entry, flow); - if (unlikely(nf_ct_is_dying(e->ct))) + if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; @@ -327,12 +352,21 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - struct flow_offload_entry *e; - e = container_of(flow, struct flow_offload_entry, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) - flow_offload_del(flow_table, flow); + if (flow->flags & FLOW_OFFLOAD_HW) + nf_flow_offload_stats(flow_table, flow); + + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || + (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) { + if (flow->flags & FLOW_OFFLOAD_HW) { + if (!(flow->flags & FLOW_OFFLOAD_HW_DYING)) + nf_flow_offload_del(flow_table, flow); + else if (flow->flags & FLOW_OFFLOAD_HW_DEAD) + flow_offload_del(flow_table, flow); + } else { + flow_offload_del(flow_table, flow); + } + } } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -465,6 +499,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) int err; INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + flow_block_init(&flowtable->flow_block); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -485,15 +520,13 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - struct flow_offload_entry *e; - - e = container_of(flow, struct flow_offload_entry, flow); if (!dev) { flow_offload_teardown(flow); return; } - if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && + + if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_dead(flow); @@ -502,6 +535,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { + nf_flow_table_offload_flush(flowtable); nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } @@ -529,5 +563,18 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int __init nf_flow_table_module_init(void) +{ + return nf_flow_table_offload_init(); +} + +static void __exit nf_flow_table_module_exit(void) +{ + nf_flow_table_offload_exit(); +} + +module_init(nf_flow_table_module_init); +module_exit(nf_flow_table_module_exit); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 593357aedb36..88bedf1ff1ae 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,9 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; } +static int nf_flow_rule_route_inet(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + int err; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); + break; + case NFPROTO_IPV6: + err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); + break; + default: + err = -1; + break; + } + + return err; +} + static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_inet, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c new file mode 100644 index 000000000000..c54c9a6cc981 --- /dev/null +++ b/net/netfilter/nf_flow_table_offload.c @@ -0,0 +1,851 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <linux/tc_act/tc_csum.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +static struct work_struct nf_flow_offload_work; +static DEFINE_SPINLOCK(flow_offload_pending_list_lock); +static LIST_HEAD(flow_offload_pending_list); + +struct flow_offload_work { + struct list_head list; + enum flow_cls_command cmd; + int priority; + struct nf_flowtable *flowtable; + struct flow_offload *flow; +}; + +struct nf_flow_key { + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + }; + struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct nf_flow_match { + struct flow_dissector dissector; + struct nf_flow_key key; + struct nf_flow_key mask; +}; + +struct nf_flow_rule { + struct nf_flow_match match; + struct flow_rule *rule; +}; + +#define NF_FLOW_DISSECTOR(__match, __type, __field) \ + (__match)->dissector.offset[__type] = \ + offsetof(struct nf_flow_key, __field) + +static int nf_flow_rule_match(struct nf_flow_match *match, + const struct flow_offload_tuple *tuple) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + + switch (tuple->l3proto) { + case AF_INET: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key->basic.n_proto = htons(ETH_P_IP); + key->ipv4.src = tuple->src_v4.s_addr; + mask->ipv4.src = 0xffffffff; + key->ipv4.dst = tuple->dst_v4.s_addr; + mask->ipv4.dst = 0xffffffff; + break; + default: + return -EOPNOTSUPP; + } + mask->basic.n_proto = 0xffff; + + switch (tuple->l4proto) { + case IPPROTO_TCP: + key->tcp.flags = 0; + mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + break; + case IPPROTO_UDP: + break; + default: + return -EOPNOTSUPP; + } + + key->basic.ip_proto = tuple->l4proto; + mask->basic.ip_proto = 0xff; + + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS); + return 0; +} + +static void flow_offload_mangle(struct flow_action_entry *entry, + enum flow_action_mangle_base htype, + u32 offset, u8 *value, u8 *mask) +{ + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.offset = offset; + memcpy(&entry->mangle.mask, mask, sizeof(u32)); + memcpy(&entry->mangle.val, value, sizeof(u32)); +} + +static inline struct flow_action_entry * +flow_action_entry_next(struct nf_flow_rule *flow_rule) +{ + int i = flow_rule->rule->action.num_entries++; + + return &flow_rule->rule->action.entries[i]; +} + +static int flow_offload_eth_src(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + struct net_device *dev; + u32 mask, val; + u16 val16; + + dev = dev_get_by_index(net, tuple->iifidx); + if (!dev) + return -ENOENT; + + mask = ~0xffff0000; + memcpy(&val16, dev->dev_addr, 2); + val = val16 << 16; + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + (u8 *)&val, (u8 *)&mask); + + mask = ~0xffffffff; + memcpy(&val, dev->dev_addr + 2, 4); + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, + (u8 *)&val, (u8 *)&mask); + dev_put(dev); + + return 0; +} + +static int flow_offload_eth_dst(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + struct neighbour *n; + u32 mask, val; + u16 val16; + + n = dst_neigh_lookup(tuple->dst_cache, &tuple->dst_v4); + if (!n) + return -ENOENT; + + mask = ~0xffffffff; + memcpy(&val, n->ha, 4); + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, + (u8 *)&val, (u8 *)&mask); + + mask = ~0x0000ffff; + memcpy(&val16, n->ha + 4, 2); + val = val16; + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + (u8 *)&val, (u8 *)&mask); + neigh_release(n); + + return 0; +} + +static void flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + (u8 *)&addr, (u8 *)&mask); +} + +static void flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + (u8 *)&addr, (u8 *)&mask); +} + +static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, + unsigned int offset, + u8 *addr, u8 *mask) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + entry = flow_action_entry_next(flow_rule); + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + offset + i, + &addr[i], mask); + } +} + +static void flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const u8 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr; + offset = offsetof(struct ipv6hdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr; + offset = offsetof(struct ipv6hdr, daddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask); +} + +static void flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const u8 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr; + offset = offsetof(struct ipv6hdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr; + offset = offsetof(struct ipv6hdr, saddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask); +} + +static int flow_offload_l4proto(const struct flow_offload *flow) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + u8 type = 0; + + switch (protonum) { + case IPPROTO_TCP: + type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; + break; + case IPPROTO_UDP: + type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; + break; + default: + break; + } + + return type; +} + +static void flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffff0000); + __be16 port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + offset = 0; /* offsetof(struct tcphdr, source); */ + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + offset = 0; /* offsetof(struct tcphdr, dest); */ + break; + default: + break; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + (u8 *)&port, (u8 *)&mask); +} + +static void flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffff); + __be16 port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + offset = 0; /* offsetof(struct tcphdr, source); */ + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + offset = 0; /* offsetof(struct tcphdr, dest); */ + break; + default: + break; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + (u8 *)&port, (u8 *)&mask); +} + +static void flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; + + switch (protonum) { + case IPPROTO_TCP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + break; + case IPPROTO_UDP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + break; + } +} + +static void flow_offload_redirect(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + struct rtable *rt; + + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = rt->dst.dev; + dev_hold(rt->dst.dev); +} + +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (flow->flags & FLOW_OFFLOAD_SNAT) { + flow_offload_ipv4_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (flow->flags & FLOW_OFFLOAD_DNAT) { + flow_offload_ipv4_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + if (flow->flags & FLOW_OFFLOAD_SNAT || + flow->flags & FLOW_OFFLOAD_DNAT) + flow_offload_ipv4_checksum(net, flow, flow_rule); + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (flow->flags & FLOW_OFFLOAD_SNAT) { + flow_offload_ipv6_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (flow->flags & FLOW_OFFLOAD_DNAT) { + flow_offload_ipv6_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); + +#define NF_FLOW_RULE_ACTION_MAX 16 + +static struct nf_flow_rule * +nf_flow_offload_rule_alloc(struct net *net, + const struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload *flow = offload->flow; + const struct flow_offload_tuple *tuple; + struct nf_flow_rule *flow_rule; + int err = -ENOMEM; + + flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + if (!flow_rule) + goto err_flow; + + flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); + if (!flow_rule->rule) + goto err_flow_rule; + + flow_rule->rule->match.dissector = &flow_rule->match.dissector; + flow_rule->rule->match.mask = &flow_rule->match.mask; + flow_rule->rule->match.key = &flow_rule->match.key; + + tuple = &flow->tuplehash[dir].tuple; + err = nf_flow_rule_match(&flow_rule->match, tuple); + if (err < 0) + goto err_flow_match; + + flow_rule->rule->action.num_entries = 0; + if (flowtable->type->action(net, flow, dir, flow_rule) < 0) + goto err_flow_match; + + return flow_rule; + +err_flow_match: + kfree(flow_rule->rule); +err_flow_rule: + kfree(flow_rule); +err_flow: + return NULL; +} + +static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < flow_rule->rule->action.num_entries; i++) { + entry = &flow_rule->rule->action.entries[i]; + if (entry->id != FLOW_ACTION_REDIRECT) + continue; + + dev_put(entry->dev); + } + kfree(flow_rule->rule); + kfree(flow_rule); +} + +static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) +{ + int i; + + for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) + __nf_flow_offload_destroy(flow_rule[i]); +} + +static int nf_flow_offload_alloc(const struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + struct net *net = read_pnet(&offload->flowtable->net); + + flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_ORIGINAL); + if (!flow_rule[0]) + return -ENOMEM; + + flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_REPLY); + if (!flow_rule[1]) { + __nf_flow_offload_destroy(flow_rule[0]); + return -ENOMEM; + } + + return 0; +} + +static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, + __be16 proto, int priority, + enum flow_cls_command cmd, + const struct flow_offload_tuple *tuple, + struct netlink_ext_ack *extack) +{ + cls_flow->common.protocol = proto; + cls_flow->common.prio = priority; + cls_flow->common.extack = extack; + cls_flow->command = cmd; + cls_flow->cookie = (unsigned long)tuple; +} + +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + int err, i = 0; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_REPLACE, + &offload->flow->tuplehash[dir].tuple, &extack); + cls_flow.rule = flow_rule->rule; + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) { + err = block_cb->cb(TC_SETUP_FT, &cls_flow, + block_cb->cb_priv); + if (err < 0) + continue; + + i++; + } + + return i; +} + +static void flow_offload_tuple_del(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_DESTROY, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv); + + offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD; +} + +static int flow_offload_rule_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + int ok_count = 0; + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + + return 0; +} + +static int flow_offload_work_add(struct flow_offload_work *offload) +{ + struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; + int err; + + err = nf_flow_offload_alloc(offload, flow_rule); + if (err < 0) + return -ENOMEM; + + err = flow_offload_rule_add(offload, flow_rule); + + nf_flow_offload_destroy(flow_rule); + + return err; +} + +static void flow_offload_work_del(struct flow_offload_work *offload) +{ + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); +} + +static void flow_offload_tuple_stats(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir, + struct flow_stats *stats) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_STATS, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv); + memcpy(stats, &cls_flow.stats, sizeof(*stats)); +} + +static void flow_offload_work_stats(struct flow_offload_work *offload) +{ + struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, + lastused + NF_FLOW_TIMEOUT); +} + +static void flow_offload_work_handler(struct work_struct *work) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + int ret; + + spin_lock_bh(&flow_offload_pending_list_lock); + list_replace_init(&flow_offload_pending_list, &offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + switch (offload->cmd) { + case FLOW_CLS_REPLACE: + ret = flow_offload_work_add(offload); + if (ret < 0) + offload->flow->flags &= ~FLOW_OFFLOAD_HW; + break; + case FLOW_CLS_DESTROY: + flow_offload_work_del(offload); + break; + case FLOW_CLS_STATS: + flow_offload_work_stats(offload); + break; + default: + WARN_ON_ONCE(1); + } + list_del(&offload->list); + kfree(offload); + } +} + +static void flow_offload_queue_work(struct flow_offload_work *offload) +{ + spin_lock_bh(&flow_offload_pending_list_lock); + list_add_tail(&offload->list, &flow_offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + schedule_work(&nf_flow_offload_work); +} + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_REPLACE; + offload->flow = flow; + offload->priority = flowtable->priority; + offload->flowtable = flowtable; + flow->flags |= FLOW_OFFLOAD_HW; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_DESTROY; + offload->flow = flow; + offload->flow->flags |= FLOW_OFFLOAD_HW_DYING; + offload->flowtable = flowtable; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + s64 delta; + + delta = flow->timeout - jiffies; + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) || + flow->flags & FLOW_OFFLOAD_HW_DYING) + return; + + offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return; + + offload->cmd = FLOW_CLS_STATS; + offload->flow = flow; + offload->flowtable = flowtable; + + flow_offload_queue_work(offload); +} + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) +{ + if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD) + flush_work(&nf_flow_offload_work); +} + +static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + struct flow_block_cb *block_cb, *next; + int err = 0; + + switch (cmd) { + case FLOW_BLOCK_BIND: + list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); + break; + case FLOW_BLOCK_UNBIND: + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo = {}; + int err; + + if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + return 0; + + if (!dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + bo.net = dev_net(dev); + bo.block = &flowtable->flow_block; + bo.command = cmd; + bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo.extack = &extack; + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + return nf_flow_table_block_setup(flowtable, &bo, cmd); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); + +int nf_flow_table_offload_init(void) +{ + INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + + return 0; +} + +void nf_flow_table_offload_exit(void) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + cancel_work_sync(&nf_flow_offload_work); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + list_del(&offload->list); + kfree(offload); + } +} diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 712a428509ad..ff04cdc87f76 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,11 +151,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static int nft_netdev_register_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err, j; + + j = 0; + list_for_each_entry(hook, hook_list, list) { + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_register; + + j++; + } + return 0; + +err_register: + list_for_each_entry(hook, hook_list, list) { + if (j-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + } + return err; +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} + +static int nft_register_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); +} + +static void nft_unregister_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + nft_netdev_unregister_hooks(net, &basechain->hook_list); + else + nf_unregister_net_hook(net, &basechain->ops); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -168,14 +221,14 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nf_register_net_hook(net, ops); + return nft_register_basechain_hooks(net, table->family, basechain); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -187,7 +240,7 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, ops); + nft_unregister_basechain_hooks(net, table->family, basechain); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -308,6 +361,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { + struct nft_flow_rule *flow; struct nft_trans *trans; int err; @@ -315,6 +369,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) if (trans == NULL) return -ENOMEM; + if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(ctx->net, rule); + if (IS_ERR(flow)) { + nft_trans_destroy(trans); + return PTR_ERR(flow); + } + + nft_trans_flow_rule(trans) = flow; + } + err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); @@ -742,7 +806,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nft_unregister_basechain_hooks(net, table->family, + nft_base_chain(chain)); } } @@ -757,14 +822,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nft_register_basechain_hooks(net, table->family, + nft_base_chain(chain)); if (err < 0) - goto err; + goto err_register_hooks; i++; } return 0; -err: + +err_register_hooks: if (i) nft_table_disable(net, table, i); return err; @@ -1225,6 +1292,46 @@ nla_put_failure: return -ENOSPC; } +static int nft_dump_basechain_hook(struct sk_buff *skb, int family, + const struct nft_base_chain *basechain) +{ + const struct nf_hook_ops *ops = &basechain->ops; + struct nft_hook *hook, *first = NULL; + struct nlattr *nest, *nest_devs; + int n = 0; + + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + + if (family == NFPROTO_NETDEV) { + nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (!first) + first = hook; + + if (nla_put_string(skb, NFTA_DEVICE_NAME, + hook->ops.dev->name)) + goto nla_put_failure; + n++; + } + nla_nest_end(skb, nest_devs); + + if (n == 1 && + nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + + return 0; +nla_put_failure: + return -1; +} + static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, @@ -1253,21 +1360,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; struct nft_stats __percpu *stats; - struct nlattr *nest; - nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); - if (nest == NULL) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + if (nft_dump_basechain_hook(skb, family, basechain)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) - goto nla_put_failure; - if (basechain->dev_name[0] && - nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) - goto nla_put_failure; - nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) @@ -1485,6 +1581,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; + struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; @@ -1495,6 +1592,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (ctx->family == NFPROTO_NETDEV) { + list_for_each_entry_safe(hook, next, + &basechain->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); @@ -1508,13 +1612,125 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) } } +static struct nft_hook *nft_netdev_hook_alloc(struct net *net, + const struct nlattr *attr) +{ + struct net_device *dev; + char ifname[IFNAMSIZ]; + struct nft_hook *hook; + int err; + + hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); + if (!hook) { + err = -ENOMEM; + goto err_hook_alloc; + } + + nla_strlcpy(ifname, attr, IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + if (!dev) { + err = -ENOENT; + goto err_hook_dev; + } + hook->ops.dev = dev; + + return hook; + +err_hook_dev: + kfree(hook); +err_hook_alloc: + return ERR_PTR(err); +} + +static bool nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) { + if (this->ops.dev == hook->ops.dev) + return true; + } + + return false; +} + +static int nf_tables_parse_netdev_hooks(struct net *net, + const struct nlattr *attr, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + const struct nlattr *tmp; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err_hook; + } + + hook = nft_netdev_hook_alloc(net, tmp); + if (IS_ERR(hook)) { + err = PTR_ERR(hook); + goto err_hook; + } + if (nft_hook_list_find(hook_list, hook)) { + err = -EEXIST; + goto err_hook; + } + list_add_tail(&hook->list, hook_list); + n++; + + if (n == NFT_NETDEVICE_MAX) { + err = -EFBIG; + goto err_hook; + } + } + if (!n) + return -EINVAL; + + return 0; + +err_hook: + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del(&hook->list); + kfree(hook); + } + return err; +} + struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; - struct net_device *dev; + struct list_head list; }; +static int nft_chain_parse_netdev(struct net *net, + struct nlattr *tb[], + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err; + + if (tb[NFTA_HOOK_DEV]) { + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + list_add_tail(&hook->list, hook_list); + } else if (tb[NFTA_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], + hook_list); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + return 0; +} + static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, @@ -1522,7 +1738,6 @@ static int nft_chain_parse_hook(struct net *net, { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; - struct net_device *dev; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1560,23 +1775,14 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; - hook->dev = NULL; + INIT_LIST_HEAD(&hook->list); if (family == NFPROTO_NETDEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { + err = nft_chain_parse_netdev(net, ha, &hook->list); + if (err < 0) { module_put(type->owner); - return -ENOENT; + return err; } - hook->dev = dev; - } else if (ha[NFTA_HOOK_DEV]) { + } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } @@ -1586,6 +1792,12 @@ static int nft_chain_parse_hook(struct net *net, static void nft_chain_release_hook(struct nft_chain_hook *hook) { + struct nft_hook *h, *next; + + list_for_each_entry_safe(h, next, &hook->list, list) { + list_del(&h->list); + kfree(h); + } module_put(hook->type->owner); } @@ -1610,6 +1822,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha return kvmalloc(alloc, GFP_KERNEL); } +static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, + const struct nft_chain_hook *hook, + struct nft_chain *chain) +{ + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; +} + +static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + struct nft_chain_hook *hook, u32 flags) +{ + struct nft_chain *chain; + struct nft_hook *h; + + basechain->type = hook->type; + INIT_LIST_HEAD(&basechain->hook_list); + chain = &basechain->chain; + + if (family == NFPROTO_NETDEV) { + list_splice_init(&hook->list, &basechain->hook_list); + list_for_each_entry(h, &basechain->hook_list, list) + nft_basechain_hook_init(&h->ops, family, hook, chain); + + basechain->ops.hooknum = hook->num; + basechain->ops.priority = hook->priority; + } else { + nft_basechain_hook_init(&basechain->ops, family, hook, chain); + } + + chain->flags |= NFT_BASE_CHAIN | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + + flow_block_init(&basechain->flow_block); + + return 0; +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1628,7 +1883,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; - struct nf_hook_ops *ops; err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) @@ -1639,9 +1893,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_chain_release_hook(&hook); return -ENOMEM; } - - if (hook.dev != NULL) - strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); + chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1654,24 +1906,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - basechain->type = hook.type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = hook.type->hooks[ops->hooknum]; - ops->dev = hook.dev; - - chain->flags |= NFT_BASE_CHAIN | flags; - basechain->policy = NF_ACCEPT; - if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) - return -EOPNOTSUPP; - - flow_block_init(&basechain->flow_block); + err = nft_basechain_init(basechain, family, &hook, flags); + if (err < 0) { + nft_chain_release_hook(&hook); + kfree(basechain); + return err; + } } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1731,6 +1971,25 @@ err1: return err; } +static bool nft_hook_list_equal(struct list_head *hook_list1, + struct list_head *hook_list2) +{ + struct nft_hook *hook; + int n = 0, m = 0; + + n = 0; + list_for_each_entry(hook, hook_list2, list) { + if (!nft_hook_list_find(hook_list1, hook)) + return false; + + n++; + } + list_for_each_entry(hook, hook_list1, list) + m++; + + return n == m; +} + static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags) { @@ -1762,12 +2021,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - ops = &basechain->ops; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (ctx->family == NFPROTO_NETDEV) { + if (!nft_hook_list_equal(&basechain->hook_list, + &hook.list)) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } else { + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority) { + nft_chain_release_hook(&hook); + return -EBUSY; + } } nft_chain_release_hook(&hook); } @@ -5580,6 +5846,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, @@ -5626,43 +5893,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static int nf_tables_parse_devices(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct net_device *dev_array[], int *len) -{ - const struct nlattr *tmp; - struct net_device *dev; - char ifname[IFNAMSIZ]; - int rem, n = 0, err; - - nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { - err = -EINVAL; - goto err1; - } - - nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = __dev_get_by_name(ctx->net, ifname); - if (!dev) { - err = -ENOENT; - goto err1; - } - - dev_array[n++] = dev; - if (n == NFT_FLOWTABLE_DEVICE_MAX) { - err = -EFBIG; - goto err1; - } - } - if (!len) - return -EINVAL; - - err = 0; -err1: - *len = n; - return err; -} - static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, @@ -5673,11 +5903,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable *flowtable) { - struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; - struct nf_hook_ops *ops; + struct nft_hook *hook; int hooknum, priority; - int err, n = 0, i; + int err; err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); @@ -5695,27 +5924,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], - dev_array, &n); + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable->hook_list); if (err < 0) return err; - ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - flowtable->hooknum = hooknum; - flowtable->priority = priority; - flowtable->ops = ops; - flowtable->ops_len = n; + flowtable->hooknum = hooknum; + flowtable->data.priority = priority; - for (i = 0; i < n; i++) { - flowtable->ops[i].pf = NFPROTO_NETDEV; - flowtable->ops[i].hooknum = hooknum; - flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data; - flowtable->ops[i].hook = flowtable->data.type->hook; - flowtable->ops[i].dev = dev_array[i]; + list_for_each_entry(hook, &flowtable->hook_list, list) { + hook->ops.pf = NFPROTO_NETDEV; + hook->ops.hooknum = hooknum; + hook->ops.priority = priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -5752,17 +5975,73 @@ nft_flowtable_type_get(struct net *net, u8 family) return ERR_PTR(-ENOENT); } +static void nft_unregister_flowtable_hook(struct net *net, + struct nft_flowtable *flowtable, + struct nft_hook *hook) +{ + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); +} + static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; + list_for_each_entry(hook, &flowtable->hook_list, list) + nft_unregister_flowtable_hook(net, flowtable, hook); +} + +static int nft_register_flowtable_net_hooks(struct net *net, + struct nft_table *table, + struct nft_flowtable *flowtable) +{ + struct nft_hook *hook, *hook2, *next; + struct nft_flowtable *ft; + int err, i = 0; + + list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(ft, &table->flowtables, list) { + list_for_each_entry(hook2, &ft->hook_list, list) { + if (hook->ops.dev == hook2->ops.dev && + hook->ops.pf == hook2->ops.pf) { + err = -EBUSY; + goto err_unregister_net_hooks; + } + } + } + + err = flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_BIND); + if (err < 0) + goto err_unregister_net_hooks; + + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) { + flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_UNBIND); + goto err_unregister_net_hooks; + } + + i++; + } + + return 0; - nf_unregister_net_hook(net, &flowtable->ops[i]); +err_unregister_net_hooks: + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (i-- <= 0) + break; + + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); } + + return err; } static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, @@ -5773,12 +6052,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; - struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_hook *hook, *next; struct nft_table *table; struct nft_ctx ctx; - int err, i, k; + int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || @@ -5817,6 +6097,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { @@ -5830,6 +6111,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err2; } + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flowtable->data.flags = + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + goto err3; + } + + write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) @@ -5840,43 +6129,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err4; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; - - list_for_each_entry(ft, &table->flowtables, list) { - for (k = 0; k < ft->ops_len; k++) { - if (!ft->ops[k].dev) - continue; - - if (flowtable->ops[i].dev == ft->ops[k].dev && - flowtable->ops[i].pf == ft->ops[k].pf) { - err = -EBUSY; - goto err5; - } - } - } - - err = nf_register_net_hook(net, &flowtable->ops[i]); - if (err < 0) - goto err5; - } + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + if (err < 0) + goto err4; err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err6; + goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err6: - i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[k]); - - kfree(flowtable->ops); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } err4: flowtable->data.type->free(&flowtable->data); err3: @@ -5943,8 +6213,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; + struct nft_hook *hook; struct nlmsghdr *nlh; - int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); @@ -5960,25 +6230,23 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), - NFTA_FLOWTABLE_PAD)) + NFTA_FLOWTABLE_PAD) || + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || - nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; - for (i = 0; i < flowtable->ops_len; i++) { - const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); - - if (dev && - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6169,7 +6437,12 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - kfree(flowtable->ops); + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_del_rcu(&hook->list); + kfree(hook); + } kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); module_put(flowtable->data.type->owner); @@ -6209,14 +6482,15 @@ nla_put_failure: static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev != dev) + list_for_each_entry(hook, &flowtable->hook_list, list) { + if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->ops[i].dev = NULL; + nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); break; } } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index e25dab8128db..68f17a6921d8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -132,13 +132,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, common->extack = extack; } -static int nft_setup_cb_call(struct nft_base_chain *basechain, - enum tc_setup_type type, void *type_data) +static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, + struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { + list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -155,32 +155,46 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } +static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, + const struct nft_base_chain *basechain, + const struct nft_rule *rule, + const struct nft_flow_rule *flow, + struct netlink_ext_ack *extack, + enum flow_cls_command command) +{ + __be16 proto = ETH_P_ALL; + + memset(cls_flow, 0, sizeof(*cls_flow)); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow->common, proto, + basechain->ops.priority, extack); + cls_flow->command = command; + cls_flow->cookie = (unsigned long) rule; + if (flow) + cls_flow->rule = flow->rule; +} + static int nft_flow_offload_rule(struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { - struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; + struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + command); - if (flow) - proto = flow->proto; - - nft_flow_offload_common_init(&cls_flow.common, proto, - basechain->ops.priority, &extack); - cls_flow.command = command; - cls_flow.cookie = (unsigned long) rule; - if (flow) - cls_flow.rule = flow->rule; - - return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + &basechain->flow_block.cb_list); } static int nft_flow_offload_bind(struct flow_block_offload *bo, @@ -194,6 +208,18 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; + struct flow_cls_offload cls_flow; + struct netlink_ext_ack extack; + struct nft_chain *chain; + struct nft_rule *rule; + + chain = &basechain->chain; + list_for_each_entry(rule, &chain->rules, list) { + memset(&extack, 0, sizeof(extack)); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, + &extack, FLOW_CLS_DESTROY); + nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); + } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); @@ -224,20 +250,30 @@ static int nft_block_setup(struct nft_base_chain *basechain, return err; } +static void nft_flow_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + memset(bo, 0, sizeof(*bo)); + bo->net = net; + bo->block = &basechain->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; int err; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) @@ -253,17 +289,12 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; + struct flow_block_offload bo; if (!chain) return; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); @@ -274,15 +305,10 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { - struct flow_block_offload bo = {}; struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - bo.net = dev_net(dev); - bo.block = &chain->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); flow_indr_block_call(dev, &bo, cmd); @@ -294,32 +320,122 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK -static int nft_flow_offload_chain(struct nft_chain *chain, - u8 *ppolicy, +static int nft_chain_offload_cmd(struct nft_base_chain *basechain, + struct net_device *dev, + enum flow_block_command cmd) +{ + int err; + + if (dev->netdev_ops->ndo_setup_tc) + err = nft_block_offload_cmd(basechain, dev, cmd); + else + err = nft_indr_block_offload_cmd(basechain, dev, cmd); + + return err; +} + +static int nft_flow_block_chain(struct nft_base_chain *basechain, + const struct net_device *this_dev, + enum flow_block_command cmd) +{ + struct net_device *dev; + struct nft_hook *hook; + int err, i = 0; + + list_for_each_entry(hook, &basechain->hook_list, list) { + dev = hook->ops.dev; + if (this_dev && this_dev != dev) + continue; + + err = nft_chain_offload_cmd(basechain, dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; + + return err; + } + i++; + } + + return 0; + +err_flow_block: + list_for_each_entry(hook, &basechain->hook_list, list) { + if (i-- <= 0) + break; + + dev = hook->ops.dev; + nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + } + return err; +} + +static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; - struct net_device *dev; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - dev = basechain->ops.dev; - if (!dev) - return -EOPNOTSUPP; - policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; - if (dev->netdev_ops->ndo_setup_tc) - return nft_block_offload_cmd(basechain, dev, cmd); - else - return nft_indr_block_offload_cmd(basechain, dev, cmd); + return nft_flow_block_chain(basechain, NULL, cmd); +} + +static void nft_flow_rule_offload_abort(struct net *net, + struct nft_trans *trans) +{ + int err = 0; + + list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_BIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); + break; + } + + if (WARN_ON_ONCE(err)) + break; + } } int nft_flow_rule_offload_commit(struct net *net) @@ -355,14 +471,14 @@ int nft_flow_rule_offload_commit(struct net *net) continue; if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) - return -EOPNOTSUPP; - + !(trans->ctx.flags & NLM_F_APPEND)) { + err = -EOPNOTSUPP; + break; + } err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) @@ -370,13 +486,31 @@ int nft_flow_rule_offload_commit(struct net *net) err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), - nft_trans_flow_rule(trans), - FLOW_CLS_DESTROY); + NULL, FLOW_CLS_DESTROY); break; } - if (err) - return err; + if (err) { + nft_flow_rule_offload_abort(net, trans); + break; + } + } + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + default: + break; + } } return err; @@ -386,6 +520,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) { struct nft_base_chain *basechain; struct net *net = dev_net(dev); + struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; @@ -398,8 +533,16 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; + found = NULL; basechain = nft_base_chain(chain); - if (strncmp(basechain->dev_name, dev->name, IFNAMSIZ)) + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = hook; + break; + } + if (!found) continue; return chain; @@ -427,18 +570,6 @@ static void nft_indr_block_cb(struct net_device *dev, mutex_unlock(&net->nft.commit_mutex); } -static void nft_offload_chain_clean(struct nft_chain *chain) -{ - struct nft_rule *rule; - - list_for_each_entry(rule, &chain->rules, list) { - nft_flow_offload_rule(chain, rule, - NULL, FLOW_CLS_DESTROY); - } - - nft_flow_offload_chain(chain, NULL, FLOW_BLOCK_UNBIND); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -449,7 +580,9 @@ static int nft_offload_netdev_event(struct notifier_block *this, mutex_lock(&net->nft.commit_mutex); chain = __nft_offload_get_chain(dev); if (chain) - nft_offload_chain_clean(chain); + nft_flow_block_chain(nft_base_chain(chain), dev, + FLOW_BLOCK_UNBIND); + mutex_unlock(&net->nft.commit_mutex); return NOTIFY_DONE; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..c78d01bc02e9 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + struct nft_hook *hook, *found = NULL; + int n = 0; - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - /* UNREGISTER events are also happpening on netns exit. - * - * Altough nf_tables core releases all tables/chains, only - * this event handler provides guarantee that - * basechain.ops->dev is still accessible, so we cannot - * skip exiting net namespaces. - */ - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; + if (event != NETDEV_UNREGISTER) + return; - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev == dev) + found = hook; + + n++; } + if (!found) + return; + + if (n > 1) { + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); + kfree_rcu(found, rcu); + return; + } + + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ + __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 0744b2bb46da..b8092069f868 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> +#include <linux/if_arp.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> @@ -125,6 +126,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; + if (reg->key == FLOW_DISSECTOR_KEY_META && + reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) && + nft_reg_load16(priv->data.data) != ARPHRD_ETHER) + return -EOPNOTSUPP; + nft_offload_update_dependency(ctx, &priv->data, priv->len); return 0; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index f29bbc74c4bf..dd82ff2ee19f 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -115,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (nft_flow_route(pkt, ct, &route, dir) < 0) goto err_flow_route; - flow = flow_offload_alloc(ct, &route); + flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; + if (flow_offload_route_init(flow, &route) < 0) + goto err_flow_add; + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 317e3a9e8c5b..9740b554fdb3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -33,19 +33,19 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -static u8 nft_meta_weekday(unsigned long secs) +static u8 nft_meta_weekday(time64_t secs) { unsigned int dse; u8 wday; secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; - dse = secs / NFT_META_SECS_PER_DAY; + dse = div_u64(secs, NFT_META_SECS_PER_DAY); wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; return wday; } -static u32 nft_meta_hour(unsigned long secs) +static u32 nft_meta_hour(time64_t secs) { struct tm tm; @@ -250,10 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store64(dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: - nft_reg_store8(dest, nft_meta_weekday(get_seconds())); + nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds())); break; case NFT_META_TIME_HOUR: - *dest = nft_meta_hour(get_seconds()); + *dest = nft_meta_hour(ktime_get_real_seconds()); break; default: WARN_ON(1); @@ -547,6 +547,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; + case NFT_META_IIF: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); + break; + case NFT_META_IIFTYPE: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 5cb2d8908d2a..1993af3a2979 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,50 +23,58 @@ #include <linux/ip.h> #include <linux/ipv6.h> +static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, + struct vlan_ethhdr *veth) +{ + if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN)) + return false; + + veth->h_vlan_proto = skb->vlan_proto; + veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth->h_vlan_encapsulated_proto = skb->protocol; + + return true; +} + /* add vlan header into the user buffer for if tag was removed by offloads */ static bool nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; - u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; + u8 vlan_hlen = 0; + + if ((skb->protocol == htons(ETH_P_8021AD) || + skb->protocol == htons(ETH_P_8021Q)) && + offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) + vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < ETH_HLEN) { - u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + if (offset < VLAN_ETH_HLEN + vlan_hlen) { + u8 ethlen = len; - if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + if (vlan_hlen && + skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) + return false; + else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - veth.h_vlan_proto = skb->vlan_proto; + if (offset + len > VLAN_ETH_HLEN + vlan_hlen) + ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; - memcpy(dst_u8, vlanh + offset, ethlen); + memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN; - } else if (offset >= VLAN_ETH_HLEN) { - offset -= VLAN_HLEN; - goto skip; + offset = ETH_HLEN + vlan_hlen; + } else { + offset -= VLAN_HLEN + vlan_hlen; } - veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); - veth.h_vlan_encapsulated_proto = skb->protocol; - - vlanh += offset; - - vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); - memcpy(dst_u8, vlanh, vlan_len); - - len -= vlan_len; - if (!len) - return true; - - dst_u8 += vlan_len; - skip: return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } @@ -174,6 +182,44 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; + case offsetof(struct ethhdr, h_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, + n_proto, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index be7798a50546..713fb38541df 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) return 0; /* Error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 8dbb4d48f2ed..67cb98489415 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y) * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ -static inline unsigned int localtime_1(struct xtm *r, time_t time) +static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ - v = time % SECONDS_PER_DAY; + div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; @@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time) return v; } -static inline void localtime_2(struct xtm *r, time_t time) +static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ - r->dse = time / 86400; + r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). @@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time) r->weekday = (4 + r->dse - 1) % 7 + 1; } -static void localtime_3(struct xtm *r, time_t time) +static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; @@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; - s64 stamp; + time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp @@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * 1. match before 13:00 * 2. match after 13:00 * - * If you match against processing time (get_seconds) it + * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ - stamp = get_seconds(); + stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ @@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) + * + * note: info->date_start/stop are unsigned 32-bit values that + * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index efccd1ac9a66..0522b2b1fd95 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -458,10 +458,63 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static struct genl_dumpit_info *genl_dumpit_info_alloc(void) +{ + return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); +} + +static void genl_dumpit_info_free(const struct genl_dumpit_info *info) +{ + kfree(info); +} + +static struct nlattr ** +genl_family_rcv_msg_attrs_parse(const struct genl_family *family, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, + enum genl_validate_flags no_strict_flag, + bool parallel) +{ + enum netlink_validation validate = ops->validate & no_strict_flag ? + NL_VALIDATE_LIBERAL : + NL_VALIDATE_STRICT; + struct nlattr **attrbuf; + int err; + + if (!family->maxattr) + return NULL; + + if (parallel) { + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); + } else { + attrbuf = family->attrbuf; + } + + err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, + family->policy, validate, extack); + if (err && parallel) { + kfree(attrbuf); + return ERR_PTR(err); + } + return attrbuf; +} + +static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, + struct nlattr **attrbuf, + bool parallel) +{ + if (parallel) + kfree(attrbuf); +} + static int genl_lock_start(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc = 0; if (ops->start) { @@ -474,8 +527,7 @@ static int genl_lock_start(struct netlink_callback *cb) static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_ops *ops = genl_dumpit_info(cb)->ops; int rc; genl_lock(); @@ -486,8 +538,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { - /* our ops are always const - netlink API doesn't propagate that */ - const struct genl_ops *ops = cb->data; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; int rc = 0; if (ops->done) { @@ -495,120 +547,111 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); return rc; } -static int genl_family_rcv_msg(const struct genl_family *family, - struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int genl_parallel_done(struct netlink_callback *cb) { - const struct genl_ops *ops; - struct net *net = sock_net(skb->sk); - struct genl_info info; - struct genlmsghdr *hdr = nlmsg_data(nlh); - struct nlattr **attrbuf; - int hdrlen, err; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_ops *ops = info->ops; + int rc = 0; - /* this family doesn't exist in this netns */ - if (!family->netnsok && !net_eq(net, &init_net)) - return -ENOENT; + if (ops->done) + rc = ops->done(cb); + genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_dumpit_info_free(info); + return rc; +} - hdrlen = GENL_HDRLEN + family->hdrsize; - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; +static int genl_family_rcv_msg_dumpit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; + int err; - ops = genl_get_cmd(hdr->cmd, family); - if (ops == NULL) + if (!ops->dumpit) return -EOPNOTSUPP; - if ((ops->flags & GENL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if ((ops->flags & GENL_UNS_ADMIN_PERM) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - int rc; - - if (ops->dumpit == NULL) - return -EOPNOTSUPP; - - if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) { - int hdrlen = GENL_HDRLEN + family->hdrsize; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; - if (family->maxattr) { - unsigned int validate = NL_VALIDATE_STRICT; - - if (ops->validate & - GENL_DONT_VALIDATE_DUMP_STRICT) - validate = NL_VALIDATE_LIBERAL; - rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), - family->maxattr, - family->policy, - validate, extack); - if (rc) - return rc; - } - } + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - /* we have const, but the netlink API doesn't */ - .data = (void *)ops, - .start = genl_lock_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - }; + attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT, + true); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + /* Allocate dumpit info. It is going to be freed by done() callback. */ + info = genl_dumpit_info_alloc(); + if (!info) { + genl_family_rcv_msg_attrs_free(family, attrs, true); + return -ENOMEM; + } - genl_unlock(); - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); + info->family = family; + info->ops = ops; + info->attrs = attrs; - } else { - struct netlink_dump_control c = { - .module = family->module, - .start = ops->start, - .dump = ops->dumpit, - .done = ops->done, - }; + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = genl_lock_start, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; - rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_unlock(); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); - return rc; + } else { + struct netlink_dump_control c = { + .module = family->module, + .data = info, + .start = ops->start, + .dump = ops->dumpit, + .done = genl_parallel_done, + }; + + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } - if (ops->doit == NULL) - return -EOPNOTSUPP; - - if (family->maxattr && family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (attrbuf == NULL) - return -ENOMEM; - } else - attrbuf = family->attrbuf; + return err; +} - if (attrbuf) { - enum netlink_validation validate = NL_VALIDATE_STRICT; +static int genl_family_rcv_msg_doit(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + const struct genl_ops *ops, + int hdrlen, struct net *net) +{ + struct nlattr **attrbuf; + struct genl_info info; + int err; - if (ops->validate & GENL_DONT_VALIDATE_STRICT) - validate = NL_VALIDATE_LIBERAL; + if (!ops->doit) + return -EOPNOTSUPP; - err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - family->policy, validate, extack); - if (err < 0) - goto out; - } + attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, + ops, hdrlen, + GENL_DONT_VALIDATE_STRICT, + family->parallel_ops); + if (IS_ERR(attrbuf)) + return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; @@ -632,12 +675,49 @@ static int genl_family_rcv_msg(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - if (family->parallel_ops) - kfree(attrbuf); + genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); return err; } +static int genl_family_rcv_msg(const struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + const struct genl_ops *ops; + struct net *net = sock_net(skb->sk); + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen; + + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, + ops, hdrlen, net); + else + return genl_family_rcv_msg_doit(family, skb, nlh, extack, + ops, hdrlen, net); +} + static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1088,25 +1168,6 @@ problem: subsys_initcall(genl_init); -/** - * genl_family_attrbuf - return family's attrbuf - * @family: the family - * - * Return the family's attrbuf, while validating that it's - * actually valid to access it. - * - * You cannot use this function with a family that has parallel_ops - * and you can only use it within (pre/post) doit/dumpit callbacks. - */ -struct nlattr **genl_family_attrbuf(const struct genl_family *family) -{ - if (!WARN_ON(family->parallel_ops)) - lockdep_assert_held(&genl_mutex); - - return family->attrbuf; -} -EXPORT_SYMBOL(genl_family_attrbuf); - static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig index 97bd3a2c5c98..4822d6f46947 100644 --- a/net/nfc/hci/Kconfig +++ b/net/nfc/hci/Kconfig @@ -1,12 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only config NFC_HCI - depends on NFC - tristate "NFC HCI implementation" - default n - help - Say Y here if you want to build support for a kernel NFC HCI - implementation. This is mostly needed for devices that only process - HCI frames, like for example the NXP pn544. + depends on NFC + tristate "NFC HCI implementation" + default n + help + Say Y here if you want to build support for a kernel NFC HCI + implementation. This is mostly needed for devices that only process + HCI frames, like for example the NXP pn544. config NFC_SHDLC depends on NFC_HCI diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index afde0d763039..eee0dddb7749 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -102,22 +102,14 @@ nla_put_failure: static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { - struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; - int rc; u32 idx; - rc = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, - nfc_genl_policy, NULL); - if (rc < 0) - return ERR_PTR(rc); - - if (!attrbuf[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) @@ -1695,7 +1687,8 @@ static const struct genl_ops nfc_genl_ops[] = { }, { .cmd = NFC_CMD_GET_TARGET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1c77f520f474..12936c151cc0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -200,7 +200,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (err) return err; - flow_key->mpls.top_lse = lse; + flow_key->mpls.lse[0] = lse; return 0; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 05249eb45082..df9c80bf621d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -971,6 +971,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ct = nf_ct_get(skb, &ctinfo); if (ct) { + bool add_helper = false; + /* Packets starting a new connection must be NATted before the * helper, so that the helper knows about the NAT. We enforce * this by delaying both NAT and helper calls for unconfirmed @@ -988,16 +990,17 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Userspace may decide to perform a ct lookup without a helper - * specified followed by a (recirculate and) commit with one. - * Therefore, for unconfirmed connections which we will commit, - * we need to attach the helper here. + * specified followed by a (recirculate and) commit with one, + * or attach a helper in a later commit. Therefore, for + * connections which we will commit, we may need to attach + * the helper here. */ - if (!nf_ct_is_confirmed(ct) && info->commit && - info->helper && !nfct_help(ct)) { + if (info->commit && info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) return err; + add_helper = true; /* helper installed, add seqadj if NAT is required */ if (info->nat && !nfct_seqadj(ct)) { @@ -1007,11 +1010,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") for a - * confirmed connection, or + * - nf_conntrack_in() was executed above ("!cached") or a + * helper was just attached ("add_helper") for a confirmed + * connection, or * - When committing an unconfirmed connection. */ - if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : + info->commit) && ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d8c364d637b1..93d4991ddc1f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -227,7 +227,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -349,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ - + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -392,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, size_t len; unsigned int hlen; int err, dp_ifindex; + u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) @@ -484,23 +487,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, } /* Add OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru) { - if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, - upcall_info->mru)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (upcall_info->mru && + nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { + err = -ENOBUFS; + goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ - if (cutlen > 0) { - if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, - skb->len)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (cutlen > 0 && + nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { + err = -ENOBUFS; + goto out; + } + + /* Add OVS_PACKET_ATTR_HASH */ + hash = skb_get_hash_raw(skb); + if (skb->sw_hash) + hash |= OVS_PACKET_HASH_SW_BIT; + + if (skb->l4_hash) + hash |= OVS_PACKET_HASH_L4_BIT; + + if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { + err = -ENOBUFS; + goto out; } /* Only reserve room for attribute header, packet data is added @@ -542,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *input_vport; u16 mru = 0; + u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -567,6 +578,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) } OVS_CB(packet)->mru = mru; + if (a[OVS_PACKET_ATTR_HASH]) { + hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); + + __skb_set_hash(packet, hash & 0xFFFFFFFFULL, + !!(hash & OVS_PACKET_HASH_SW_BIT), + !!(hash & OVS_PACKET_HASH_L4_BIT)); + } + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -1575,6 +1594,31 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) return 0; } +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) +{ + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1583,7 +1627,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) @@ -1596,35 +1640,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_free_reply; + goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) - goto err_free_dp; + goto err_destroy_dp; - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; err = ovs_meters_init(dp); if (err) - goto err_destroy_ports_array; + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1656,6 +1691,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } + ovs_unlock(); goto err_destroy_meters; } @@ -1672,17 +1708,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) return 0; err_destroy_meters: - ovs_unlock(); ovs_meters_exit(dp); -err_destroy_ports_array: +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); -err_free_dp: +err_destroy_dp: kfree(dp); -err_free_reply: +err_destroy_reply: kfree_skb(reply); err: return err; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 81e85dde8217..e239a46c2f94 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -139,6 +139,18 @@ struct ovs_net { bool xt_label; }; +/** + * enum ovs_pkt_hash_types - hash info to include with a packet + * to send to userspace. + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash + * over transport ports. + */ +enum ovs_pkt_hash_types { + OVS_PACKET_HASH_SW_BIT = (1ULL << 32), + OVS_PACKET_HASH_L4_BIT = (1ULL << 33), +}; + extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 38147e6a20f5..9d375e74b607 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -637,27 +637,35 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - size_t stack_len = MPLS_HLEN; + u8 label_count = 1; + memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; - error = check_header(skb, skb->mac_len + stack_len); + error = check_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (unlikely(error)) return 0; memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); - if (stack_len == MPLS_HLEN) - memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + if (label_count <= MPLS_LABEL_DEPTH) + memcpy(&key->mpls.lse[label_count - 1], &lse, + MPLS_HLEN); - skb_set_inner_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + + label_count * MPLS_HLEN); if (lse & htonl(MPLS_LS_S_MASK)) break; - stack_len += MPLS_HLEN; + label_count++; } + if (label_count > MPLS_LABEL_DEPTH) + label_count = MPLS_LABEL_DEPTH; + + key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index b830d5ff7af4..fd8ed766bdd1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -30,6 +30,7 @@ enum sw_flow_mac_proto { MAC_PROTO_ETHERNET, }; #define SW_FLOW_KEY_INVALID 0x80 +#define MPLS_LABEL_DEPTH 3 /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length @@ -85,9 +86,6 @@ struct sw_flow_key { */ union { struct { - __be32 top_lse; /* top label stack entry */ - } mpls; - struct { u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ u8 tos; /* IP ToS. */ u8 ttl; /* IP TTL/hop limit. */ @@ -135,6 +133,11 @@ struct sw_flow_key { } nd; }; } ipv6; + struct { + u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ + __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ + } mpls; + struct ovs_key_nsh nsh; /* network service header */ }; struct { @@ -166,7 +169,6 @@ struct sw_flow_key_range { struct sw_flow_mask { int ref_count; struct rcu_head rcu; - struct list_head list; struct sw_flow_key_range range; struct sw_flow_key key; }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d7559c64795d..65c2e3458ff5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -424,7 +424,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, - [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, @@ -1628,10 +1628,25 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; + u32 hdr_len; + u32 label_count, label_count_mask, i; mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); - SW_FLOW_KEY_PUT(match, mpls.top_lse, - mpls_key->mpls_lse, is_mask); + hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); + label_count = hdr_len / sizeof(struct ovs_key_mpls); + + if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || + hdr_len % sizeof(struct ovs_key_mpls)) + return -EINVAL; + + label_count_mask = GENMASK(label_count - 1, 0); + + for (i = 0 ; i < label_count; i++) + SW_FLOW_KEY_PUT(match, mpls.lse[i], + mpls_key[i].mpls_lse, is_mask); + + SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, + label_count_mask, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_MPLS); } @@ -2114,13 +2129,18 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); } else if (eth_p_mpls(swkey->eth.type)) { + u8 i, num_labels; struct ovs_key_mpls *mpls_key; - nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + num_labels = hweight_long(output->mpls.num_labels_mask); + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, + num_labels * sizeof(*mpls_key)); if (!nla) goto nla_put_failure; + mpls_key = nla_data(nla); - mpls_key->mpls_lse = output->mpls.top_lse; + for (i = 0; i < num_labels; i++) + mpls_key[i].mpls_lse = output->mpls.lse[i]; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -2406,13 +2426,14 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log); + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -2463,7 +2484,7 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return err; err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2478,7 +2499,7 @@ static int validate_and_copy_clone(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - bool log, bool last) + u32 mpls_label_count, bool log, bool last) { int start, err; u32 exec; @@ -2498,7 +2519,7 @@ static int validate_and_copy_clone(struct net *net, return err; err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2864,6 +2885,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last) { const struct nlattr *acts_if_greater, *acts_if_lesser_eq; @@ -2912,7 +2934,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2925,7 +2947,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, log); + eth_type, vlan_tci, mpls_label_count, log); if (err) return err; @@ -2952,7 +2974,8 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; @@ -3065,25 +3088,36 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, !eth_p_mpls(eth_type))) return -EINVAL; eth_type = mpls->mpls_ethertype; + mpls_label_count++; break; } - case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_MPLS: { + __be16 proto; if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; - /* Disallow subsequent L2.5+ set and mpls_pop actions - * as there is no check here to ensure that the new - * eth_type is valid and thus set actions could - * write off the end of the packet or otherwise - * corrupt it. + /* Disallow subsequent L2.5+ set actions and mpls_pop + * actions once the last MPLS label in the packet is + * is popped as there is no check here to ensure that + * the new eth type is valid and thus set actions could + * write off the end of the packet or otherwise corrupt + * it. * * Support for these actions is planned using packet * recirculation. */ - eth_type = htons(0); + proto = nla_get_be16(a); + mpls_label_count--; + + if (!eth_p_mpls(proto) || !mpls_label_count) + eth_type = htons(0); + else + eth_type = proto; + break; + } case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, @@ -3106,6 +3140,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_sample(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3176,6 +3211,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_clone(net, a, key, sfa, eth_type, vlan_tci, + mpls_label_count, log, last); if (err) return err; @@ -3188,8 +3224,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_check_pkt_len(net, a, key, sfa, eth_type, - vlan_tci, log, - last); + vlan_tci, + mpls_label_count, + log, last); if (err) return err; skip_copy = true; @@ -3219,14 +3256,18 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { int err; + u32 mpls_label_count = 0; *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); + if (eth_p_mpls(key->eth.type)) + mpls_label_count = hweight_long(key->mpls.num_labels_mask); + (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, log); + key->eth.vlan.tci, mpls_label_count, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf3582c5ed70..5904e93e5765 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -34,8 +34,13 @@ #include <net/ndisc.h> #define TBL_MIN_BUCKETS 1024 +#define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_HASH_SHIFT 8 +#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) +#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) + static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; @@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static struct mask_array *tbl_mask_array_alloc(int size) +{ + struct mask_array *new; + + size = max(MASK_ARRAY_SIZE_MIN, size); + new = kzalloc(sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + if (!new) + return NULL; + + new->count = 0; + new->max = size; + + return new; +} + +static int tbl_mask_array_realloc(struct flow_table *tbl, int size) +{ + struct mask_array *old; + struct mask_array *new; + + new = tbl_mask_array_alloc(size); + if (!new) + return -ENOMEM; + + old = ovsl_dereference(tbl->mask_array); + if (old) { + int i; + + for (i = 0; i < old->max; i++) { + if (ovsl_dereference(old->masks[i])) + new->masks[new->count++] = old->masks[i]; + } + } + + rcu_assign_pointer(tbl->mask_array, new); + kfree_rcu(old, rcu); + + return 0; +} + +static int tbl_mask_array_add_mask(struct flow_table *tbl, + struct sw_flow_mask *new) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int err, ma_count = READ_ONCE(ma->count); + + if (ma_count >= ma->max) { + err = tbl_mask_array_realloc(tbl, ma->max + + MASK_ARRAY_SIZE_MIN); + if (err) + return err; + + ma = ovsl_dereference(tbl->mask_array); + } + + BUG_ON(ovsl_dereference(ma->masks[ma_count])); + + rcu_assign_pointer(ma->masks[ma_count], new); + WRITE_ONCE(ma->count, ma_count +1); + + return 0; +} + +static void tbl_mask_array_del_mask(struct flow_table *tbl, + struct sw_flow_mask *mask) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i, ma_count = READ_ONCE(ma->count); + + /* Remove the deleted mask pointers from the array */ + for (i = 0; i < ma_count; i++) { + if (mask == ovsl_dereference(ma->masks[i])) + goto found; + } + + BUG(); + return; + +found: + WRITE_ONCE(ma->count, ma_count -1); + + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); + RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + + kfree_rcu(mask, rcu); + + /* Shrink the mask array if necessary. */ + if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && + ma_count <= (ma->max / 3)) + tbl_mask_array_realloc(tbl, ma->max / 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) + tbl_mask_array_del_mask(tbl, mask); + } +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_array *ma; - ti = table_instance_alloc(TBL_MIN_BUCKETS); + table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * + MC_HASH_ENTRIES, + __alignof__(struct mask_cache_entry)); + if (!table->mask_cache) + return -ENOMEM; + ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); + if (!ma) + goto free_mask_cache; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) - return -ENOMEM; + goto free_mask_array; ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ufid_ti) @@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); - INIT_LIST_HEAD(&table->mask_list); + rcu_assign_pointer(table->mask_array, ma); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); +free_mask_array: + kfree(ma); +free_mask_cache: + free_percpu(table->mask_cache); return -ENOMEM; } @@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) __table_instance_destroy(ti); } -static void table_instance_destroy(struct table_instance *ti, +static void table_instance_flow_free(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow, + bool count) +{ + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + if (count) + table->count--; + + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + + if (count) + table->ufid_count--; + } + + flow_mask_remove(table, flow->mask); +} + +static void table_instance_destroy(struct flow_table *table, + struct table_instance *ti, struct table_instance *ufid_ti, bool deferred) { @@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti, struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; - int ver = ti->node_ver; - int ufid_ver = ufid_ti->node_ver; - hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { - hlist_del_rcu(&flow->flow_table.node[ver]); - if (ovs_identifier_is_ufid(&flow->id)) - hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + hlist_for_each_entry_safe(flow, n, head, + flow_table.node[ti->node_ver]) { + + table_instance_flow_free(table, ti, ufid_ti, + flow, false); ovs_flow_free(flow, deferred); } } @@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table) struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - table_instance_destroy(ti, ufid_ti, false); + free_percpu(table->mask_cache); + kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + table_instance_destroy(table, ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(old_ti, old_ufid_ti, true); + table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); return 0; err_free_ti: @@ -370,13 +520,10 @@ err_free_ti: static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { - int key_start = range->start; - int key_end = range->end; - const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; + const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } @@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; @@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); + (*n_mask_hit)++; + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) @@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, return NULL; } -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, - const struct sw_flow_key *key, - u32 *n_mask_hit) +/* Flow lookup does full lookup on flow table. It starts with + * mask from index passed in *index. + */ +static struct sw_flow *flow_lookup(struct flow_table *tbl, + struct table_instance *ti, + struct mask_array *ma, + const struct sw_flow_key *key, + u32 *n_mask_hit, + u32 *index) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow *flow; struct sw_flow_mask *mask; + int i; + + if (likely(*index < ma->max)) { + mask = rcu_dereference_ovsl(ma->masks[*index]); + if (mask) { + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) + return flow; + } + } + + for (i = 0; i < ma->max; i++) { + + if (i == *index) + continue; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { /* Found */ + *index = i; + return flow; + } + } + + return NULL; +} + +/* + * mask_cache maps flow to probable mask. This cache is not tightly + * coupled cache, It means updates to mask list can result in inconsistent + * cache entry in mask cache. + * This is per cpu cache and is divided in MC_HASH_SEGS segments. + * In case of a hash collision the entry is hashed in next segment. + * */ +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 skb_hash, + u32 *n_mask_hit) +{ + struct mask_array *ma = rcu_dereference(tbl->mask_array); + struct table_instance *ti = rcu_dereference(tbl->ti); + struct mask_cache_entry *entries, *ce; struct sw_flow *flow; + u32 hash; + int seg; *n_mask_hit = 0; - list_for_each_entry_rcu(mask, &tbl->mask_list, list) { - (*n_mask_hit)++; - flow = masked_flow_lookup(ti, key, mask); - if (flow) /* Found */ + if (unlikely(!skb_hash)) { + u32 mask_index = 0; + + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + } + + /* Pre and post recirulation flows usually have the same skb_hash + * value. To avoid hash collisions, rehash the 'skb_hash' with + * 'recirc_id'. */ + if (key->recirc_id) + skb_hash = jhash_1word(skb_hash, key->recirc_id); + + ce = NULL; + hash = skb_hash; + entries = this_cpu_ptr(tbl->mask_cache); + + /* Find the cache entry 'ce' to operate on. */ + for (seg = 0; seg < MC_HASH_SEGS; seg++) { + int index = hash & (MC_HASH_ENTRIES - 1); + struct mask_cache_entry *e; + + e = &entries[index]; + if (e->skb_hash == skb_hash) { + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, + &e->mask_index); + if (!flow) + e->skb_hash = 0; return flow; + } + + if (!ce || e->skb_hash < ce->skb_hash) + ce = e; /* A better replacement cache candidate. */ + + hash >>= MC_HASH_SHIFT; } - return NULL; + + /* Cache miss, do full lookup. */ + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + if (flow) + ce->skb_hash = skb_hash; + + return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 index = 0; - return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); - struct sw_flow_mask *mask; - struct sw_flow *flow; + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i; /* Always called under ovs-mutex. */ - list_for_each_entry(mask, &tbl->mask_list, list) { - flow = masked_flow_lookup(ti, match->key, mask); + for (i = 0; i < ma->max; i++) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 __always_unused n_mask_hit; + struct sw_flow_mask *mask; + struct sw_flow *flow; + + mask = ovsl_dereference(ma->masks[i]); + if (!mask) + continue; + + flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && - ovs_flow_cmp_unmasked_key(flow, match)) + ovs_flow_cmp_unmasked_key(flow, match)) { return flow; + } } + return NULL; } @@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, int ovs_flow_tbl_num_masks(const struct flow_table *table) { - struct sw_flow_mask *mask; - int num = 0; - - list_for_each_entry(mask, &table->mask_list, list) - num++; - - return num; + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + return READ_ONCE(ma->count); } static struct table_instance *table_instance_expand(struct table_instance *ti, @@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti, return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } -/* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - if (mask) { - /* ovs-lock is required to protect mask-refcount and - * mask list. - */ - ASSERT_OVSL(); - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - kfree_rcu(mask, rcu); - } - } -} - /* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { @@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - table->count--; - if (ovs_identifier_is_ufid(&flow->id)) { - hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - table->ufid_count--; - } - - /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be - * accessible as long as the RCU read lock is held. - */ - flow_mask_remove(table, flow->mask); + table_instance_flow_free(table, ti, ufid_ti, flow, true); } static struct sw_flow_mask *mask_alloc(void) @@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a, static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { - struct list_head *ml; + struct mask_array *ma; + int i; + + ma = ovsl_dereference(tbl->mask_array); + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *t; + t = ovsl_dereference(ma->masks[i]); - list_for_each(ml, &tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (mask_equal(mask, m)) - return m; + if (t && mask_equal(mask, t)) + return t; } return NULL; @@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exsits. */ @@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, return -ENOMEM; mask->key = new->key; mask->range = new->range; - list_add_rcu(&mask->list, &tbl->mask_list); + + /* Add mask to mask-list. */ + if (tbl_mask_array_add_mask(tbl, mask)) { + kfree(mask); + return -ENOMEM; + } } else { BUG_ON(!mask->ref_count); mask->ref_count++; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index bc52045b63ff..8a5cea6ae111 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -22,6 +22,17 @@ #include "flow.h" +struct mask_cache_entry { + u32 skb_hash; + u32 mask_index; +}; + +struct mask_array { + struct rcu_head rcu; + int count, max; + struct sw_flow_mask __rcu *masks[]; +}; + struct table_instance { struct hlist_head *buckets; unsigned int n_buckets; @@ -34,7 +45,8 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct list_head mask_list; + struct mask_cache_entry __percpu *mask_cache; + struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; unsigned int ufid_count; @@ -60,8 +72,9 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, - const struct sw_flow_key *, - u32 *n_mask_hit); + const struct sw_flow_key *, + u32 skb_hash, + u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3fc38d16c456..5da9392b03d6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -403,8 +403,9 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) ids = rcu_dereference(vport->upcall_portids); - if (ids->n_ids == 1 && ids->ids[0] == 0) - return 0; + /* If there is only one portid, select it in the fast-path. */ + if (ids->n_ids == 1) + return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 82a50e850245..53c1d41fb1c9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1295,15 +1295,21 @@ static void packet_sock_destruct(struct sock *sk) static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - u32 rxhash; + u32 *history = po->rollover->history; + u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) - if (po->rollover->history[i] == rxhash) + if (READ_ONCE(history[i]) == rxhash) count++; - po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + victim = prandom_u32() % ROLLOVER_HLEN; + + /* Avoid dirtying the cache line if possible */ + if (READ_ONCE(history[victim]) != rxhash) + WRITE_ONCE(history[victim], rxhash); + return count > (ROLLOVER_HLEN >> 1); } diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index e35869e81766..15ce9b642b25 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -111,15 +111,11 @@ static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait) static int qrtr_tun_release(struct inode *inode, struct file *filp) { struct qrtr_tun *tun = filp->private_data; - struct sk_buff *skb; qrtr_endpoint_unregister(&tun->ep); /* Discard all SKBs */ - while (!skb_queue_empty(&tun->queue)) { - skb = skb_dequeue(&tun->queue); - kfree_skb(skb); - } + skb_queue_purge(&tun->queue); kfree(tun); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9de2ae22d583..3fd5f40189bd 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> @@ -107,6 +108,7 @@ static void rds_ib_dev_free(struct work_struct *work) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); + dma_pool_destroy(rds_ibdev->rid_hdrs_pool); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); @@ -182,6 +184,12 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->pd = NULL; goto put_dev; } + rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name, + device->dma_device, + sizeof(struct rds_header), + L1_CACHE_BYTES, 0); + if (!rds_ibdev->rid_hdrs_pool) + goto put_dev; rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); diff --git a/net/rds/ib.h b/net/rds/ib.h index f2b558e8b5ea..6e6f24753998 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -165,8 +165,8 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; struct rm_data_op *i_data_op; - struct rds_header *i_send_hdrs; - dma_addr_t i_send_hdrs_dma; + struct rds_header **i_send_hdrs; + dma_addr_t *i_send_hdrs_dma; struct rds_ib_send_work *i_sends; atomic_t i_signaled_sends; @@ -175,8 +175,8 @@ struct rds_ib_connection { struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - dma_addr_t i_recv_hdrs_dma; + struct rds_header **i_recv_hdrs; + dma_addr_t *i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; @@ -246,6 +246,7 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; + struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ bool use_fastreg; unsigned int max_mrs; @@ -381,7 +382,11 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); - +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs); +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs); #define rds_ib_conn_error(conn, fmt...) \ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 18c6fac6ead9..c71f4328d138 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/slab.h> @@ -439,6 +440,68 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) rds_ibdev->vector_load[index]--; } +/* Allocate DMA coherent memory to be used to store struct rds_header for + * sending/receiving packets. The pointers to the DMA memory and the + * associated DMA addresses are stored in two arrays. + * + * @ibdev: the IB device + * @pool: the DMA memory pool + * @dma_addrs: pointer to the array for storing DMA addresses + * @num_hdrs: number of headers to allocate + * + * It returns the pointer to the array storing the DMA memory pointers. On + * error, NULL pointer is returned. + */ +struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, + struct dma_pool *pool, + dma_addr_t **dma_addrs, u32 num_hdrs) +{ + struct rds_header **hdrs; + dma_addr_t *hdr_daddrs; + u32 i; + + hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdrs) + return NULL; + + hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL, + ibdev_to_node(ibdev)); + if (!hdr_daddrs) { + kvfree(hdrs); + return NULL; + } + + for (i = 0; i < num_hdrs; i++) { + hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]); + if (!hdrs[i]) { + rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i); + return NULL; + } + } + + *dma_addrs = hdr_daddrs; + return hdrs; +} + +/* Free the DMA memory used to store struct rds_header. + * + * @pool: the DMA memory pool + * @hdrs: pointer to the array storing DMA memory pointers + * @dma_addrs: pointer to the array storing DMA addresses + * @num_hdars: number of headers to free. + */ +void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, + dma_addr_t *dma_addrs, u32 num_hdrs) +{ + u32 i; + + for (i = 0; i < num_hdrs; i++) + dma_pool_free(pool, hdrs[i], dma_addrs[i]); + kvfree(hdrs); + kvfree(dma_addrs); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -452,6 +515,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; unsigned long max_wrs; int ret, fr_queue_space; + struct dma_pool *pool; /* * It's normal to see a null device if an incoming connection races @@ -547,31 +611,28 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_cq_out; } - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); + pool = rds_ibdev->rid_hdrs_pool; + ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); if (!ic->i_send_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); + rdsdebug("DMA send hdrs alloc failed\n"); goto qp_out; } - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); + ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); if (!ic->i_recv_hdrs) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); + rdsdebug("DMA recv hdrs alloc failed\n"); goto send_hdrs_dma_out; } - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); + ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL, + &ic->i_ack_dma); if (!ic->i_ack) { ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); + rdsdebug("DMA ack header alloc failed\n"); goto recv_hdrs_dma_out; } @@ -602,17 +663,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn) sends_out: vfree(ic->i_sends); + ack_dma_out: - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + recv_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, ic->i_recv_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + send_hdrs_dma_out: - ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, ic->i_send_hdrs_dma); + rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + qp_out: rdma_destroy_qp(ic->i_cm_id); recv_cq_out: @@ -990,8 +1057,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_cm_id ? ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); err = rdma_disconnect(ic->i_cm_id); if (err) { @@ -1041,24 +1106,39 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ib_destroy_cq(ic->i_recv_cq); } - /* then free the resources that ib callbacks use */ - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); + if (ic->rds_ibdev) { + struct dma_pool *pool; + + pool = ic->rds_ibdev->rid_hdrs_pool; + + /* then free the resources that ib callbacks use */ + if (ic->i_send_hdrs) { + rds_dma_hdrs_free(pool, ic->i_send_hdrs, + ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr); + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + } + + if (ic->i_recv_hdrs) { + rds_dma_hdrs_free(pool, ic->i_recv_hdrs, + ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + } + + if (ic->i_ack) { + dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + ic->i_ack = NULL; + } + } else { + WARN_ON(ic->i_send_hdrs); + WARN_ON(ic->i_send_hdrs_dma); + WARN_ON(ic->i_recv_hdrs); + WARN_ON(ic->i_recv_hdrs_dma); + WARN_ON(ic->i_ack); + } if (ic->i_sends) rds_ib_send_clear_ring(ic); @@ -1077,9 +1157,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_pd = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; } BUG_ON(ic->rds_ibdev); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a0f99bbf362c..694d411dc72f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -61,7 +61,7 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.num_sge = RDS_IB_RECV_SGE; sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -343,7 +343,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, WARN_ON(ret != 1); sge = &recv->r_sge[0]; - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs]; sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; @@ -861,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -993,10 +993,11 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), + wc->vendor_err); } /* rds_ib_process_recv() doesn't always consume the frag, and diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dfe6237dafe2..d1cc1d7778d8 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -201,7 +201,8 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->addr = ic->i_send_hdrs_dma[i]; + sge->length = sizeof(struct rds_header); sge->lkey = ic->i_pd->local_dma_lkey; @@ -300,10 +301,10 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, - ib_wc_status_msg(wc->status)); + ib_wc_status_msg(wc->status), wc->vendor_err); } } @@ -631,11 +632,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_queued = jiffies; send->s_op = NULL; - send->s_sge[0].addr = ic->i_send_hdrs_dma - + (pos * sizeof(struct rds_header)); + send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; + send->s_sge[0].length = sizeof(struct rds_header); - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, + sizeof(struct rds_header)); + /* Set up the data, if present */ if (i < work_alloc @@ -674,7 +677,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); if (ic->i_flowctl && adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; + struct rds_header *hdr = ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ hdr->h_credit = adv_credits; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 6a0df7c8a939..46b8ff24020d 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -906,7 +906,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); out_release: release_sock(sk); @@ -1011,7 +1011,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); rose_insert_socket(make); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 64830d8c1fdb..452163eadb98 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -209,6 +209,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { + const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); @@ -230,6 +231,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } _leave(" = %p", peer); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 69d4676a402f..7fc1e2c1b656 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_PKT64 */ + + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ @@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats) + int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; + p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -451,6 +454,17 @@ err1: } EXPORT_SYMBOL(tcf_idr_create); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags) +{ + /* Set cpustats according to actions flags. */ + return tcf_idr_create(tn, index, est, a, ops, bind, + !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); +} +EXPORT_SYMBOL(tcf_idr_create_from_flags); + void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + if (a->tcfa_flags) { + struct nla_bitfield32 flags = { a->tcfa_flags, + a->tcfa_flags, }; + + if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) + goto nla_put_failure; + } + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -831,12 +853,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, + [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_flags_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -845,6 +870,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { + struct nla_bitfield32 flags = { 0, 0 }; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -876,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); @@ -914,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, extack); + rtnl_held, tp, flags.value, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, extack); + tp, flags.value, extack); if (err < 0) goto err_mod; @@ -975,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err = PTR_ERR(act); goto err; } - act->order = i; sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; @@ -989,6 +1016,29 @@ err: return err; } +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw) +{ + if (a->cpu_bstats) { + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (drop) + this_cpu_ptr(a->cpu_qstats)->drops += packets; + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + return; + } + + _bstats_update(&a->tcfa_bstats, bytes, packets); + if (drop) + a->tcfa_qstats.drops += packets; + if (hw) + _bstats_update(&a->tcfa_bstats_hw, bytes, packets); +} +EXPORT_SYMBOL(tcf_action_update_stats); + int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 04b7bd4ec751..46f47e58b3be 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, - &act_bpf_ops, bind, true); + &act_bpf_ops, bind, true, 0); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2b43cacf82af..43a243081e7d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false); + &act_connmark_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d3cfad88dc3a..16e67e1c1db1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_new; @@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_csum_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_csum_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); - bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); + tcf_action_update_bstats(&p->common, skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) @@ -624,7 +624,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&p->common); action = TC_ACT_SHOT; goto out; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index fcc46025e790..c13638aeef46 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -465,16 +465,15 @@ out_push: skb_push_rcsum(skb, nh_ofs); out: - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + tcf_action_update_bstats(&c->common, skb); return retval; drop: - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + tcf_action_inc_drop_qstats(&c->common); return TC_ACT_SHOT; } static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { - [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 }, [TCA_CT_ACTION] = { .type = NLA_U16 }, [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, [TCA_CT_ZONE] = { .type = NLA_U16 }, @@ -656,7 +655,7 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int replace, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); @@ -688,8 +687,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return err; if (!err) { - err = tcf_idr_create(tn, index, est, a, - &act_ct_ops, bind, true); + err = tcf_idr_create_from_flags(tn, index, est, a, + &act_ct_ops, bind, flags); if (err) { tcf_idr_cleanup(tn, index); return err; @@ -905,11 +904,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, { struct tcf_ct *c = to_ct(a); - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 0dbcfd1dca7b..b1e601007242 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); @@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false); + &act_ctinfo_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 324f1d1f6d47..416065772719 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gact_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gact_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, action = gact_rand[ptype](gact); } #endif - bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); + tcf_action_update_bstats(&gact->common, skb); if (action == TC_ACT_SHOT) - qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&gact->common); tcf_lastuse_update(&gact->tcf_tm); @@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, - packets); - if (action == TC_ACT_SHOT) - this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - - if (hw) - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), - bytes, packets); - + tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3a31e241c647..d562c88cccbe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, - bind, true); + bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 214a03d405cf..400a2cfe8452 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -95,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind, - struct tcf_proto *tp) + struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -144,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, - false); + false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -205,19 +205,19 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool unlocked, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp); + bind, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 08923b21e566..b6e1b5bbb4da 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } - ret = tcf_idr_create(tn, index, est, a, - &act_mirred_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -231,7 +231,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, } tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + tcf_action_update_bstats(&m->common, skb); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); @@ -289,8 +289,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - res->qstats = this_cpu_ptr(m->common.cpu_qstats); - skb_tc_reinsert(skb, res); + if (skb_tc_reinsert(skb, res)) + tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } @@ -303,7 +303,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (err) { out: - qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); + tcf_action_inc_overlimit_qstats(&m->common); if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } @@ -318,10 +318,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 4cf6c553bb0b..c7d5e12ee919 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -119,7 +119,6 @@ static int valid_label(const struct nlattr *attr, } static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { - [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), @@ -131,7 +130,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); struct nlattr *tb[TCA_MPLS_MAX + 1]; @@ -224,7 +224,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_mpls_ops, bind, true); + &act_mpls_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index ea4c5359e7df..855a6fa16a62 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false); + &act_nat_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, icmph = (void *)(skb_network_header(skb) + ihl); - if ((icmph->type != ICMP_DEST_UNREACH) && - (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) + if (!icmp_is_err(icmph->type)) break; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b5bc631b96b7..3ad718576304 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -137,7 +137,8 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; @@ -188,7 +189,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false); + &act_pedit_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); goto out_free; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 89c04c52af3d..d96271590268 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -47,7 +47,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; @@ -87,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, - &act_police_ops, bind, true); + &act_police_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -294,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a, struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -345,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(police->tcf_tm.expires); + tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 514456a0b9a8..29b23bfaf10d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_sample_ops, bind, true); + &act_sample_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6120e56117ca..9813ca4006dd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ - pr_info("simple: %s_%d\n", + pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_simp_ops, bind, false); + &act_simp_ops, bind, false, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6a8d3337c577..5f7ca7f89ca2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbedit_ops, bind, true); + &act_skbedit_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 888437f97ba6..39e6d94cfafb 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); @@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, index, est, a, - &act_skbmod_ops, bind, true); + &act_skbmod_ops, bind, true, 0); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d55669e14741..6379f9568ab8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -10,6 +10,8 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -31,7 +33,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); - bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { @@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, static const struct nla_policy enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN }, [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static int tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) @@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, return opt_len; } +static int +tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (dst) { + struct vxlan_metadata *md = dst; + + md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + } + + return sizeof(struct vxlan_metadata); +} + +static int +tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + } else if (ver == 2) { + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + if (dst) { + struct erspan_metadata *md = dst; + + md->version = ver; + if (ver == 1) { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } else { + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } + + return sizeof(struct erspan_metadata); +} + static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int dst_len, struct netlink_ext_ack *extack) { - int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0; const struct nlattr *attr, *head = nla_data(nla); err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, @@ -130,6 +228,10 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } opt_len = tunnel_key_copy_geneve_opt(attr, dst, dst_len, extack); if (opt_len < 0) @@ -143,6 +245,31 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, dst_len -= opt_len; dst += opt_len; } + type = TUNNEL_GENEVE_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_vxlan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: + if (type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + opt_len = tunnel_key_copy_erspan_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; break; } } @@ -179,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #else return -EAFNOSUPPORT; #endif + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif default: NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); return -EINVAL; @@ -212,7 +355,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, + struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); @@ -351,8 +494,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_tunnel_key_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_tunnel_key_ops, bind, + act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; @@ -454,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return 0; } +static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, start); + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); + if (!start) + return -EMSGSIZE; + + if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, start); + return 0; +err: + nla_nest_cancel(skb, start); + return -EMSGSIZE; +} + static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -471,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + err = tunnel_key_vxlan_opts_dump(skb, info); + if (err) + goto err_out; + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + err = tunnel_key_erspan_opts_dump(skb, info); + if (err) + goto err_out; } else { err_out: nla_nest_cancel(skb, start); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 08aaf719a70f..b6939abc61eb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, u16 tci; tcf_lastuse_update(&v->tcf_tm); - bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); + tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -88,7 +88,7 @@ out: return action; drop: - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } @@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct tcf_proto *tp, struct netlink_ext_ack *extack) + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_vlan_ops, bind, true); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -307,10 +308,7 @@ static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + tcf_action_update_stats(a, bytes, packets, false, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74221e3351c3..c307ee1d6ca6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -22,6 +22,8 @@ #include <net/ip.h> #include <net/flow_dissector.h> #include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -688,7 +690,11 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { static const struct nla_policy enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = { + .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN }, [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -699,6 +705,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { .len = 128 }, }; +static const struct nla_policy +vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -928,6 +947,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(struct geneve_opt) + data_len; } +static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1]; + struct vxlan_metadata *md; + int err; + + md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) { + NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla, + vxlan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) + md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + + return sizeof(*md); +} + +static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1]; + struct erspan_metadata *md; + int err; + + md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + md->version = 1; + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) { + NL_SET_ERR_MSG(extack, "Non-erspan option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla, + erspan_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) + md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]); + + if (md->version == 1) { + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(nla); + } + } else if (md->version == 2) { + if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] || + !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) { + NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); + return -EINVAL; + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(nla); + } + if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) { + nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(nla)); + } + } else { + NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); + return -EINVAL; + } + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -958,6 +1076,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) { switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: + if (key->enc_opts.dst_opt_type && + key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); + return -EINVAL; + } option_len = 0; key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; option_len = fl_set_geneve_opt(nla_opt_key, key, @@ -986,6 +1109,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, if (msk_depth) nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; + case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + option_len = fl_set_vxlan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; + case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + option_len = fl_set_erspan_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -2135,6 +2324,61 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_vxlan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN); + if (!nest) + goto nla_put_failure; + + md = (struct vxlan_metadata *)&enc_opts->data[0]; + if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_erspan_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN); + if (!nest) + goto nla_put_failure; + + md = (struct erspan_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version)) + goto nla_put_failure; + + if (md->version == 1 && + nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) + goto nla_put_failure; + + if (md->version == 2 && + (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, + md->u.md2.dir) || + nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -2188,6 +2432,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, if (err) goto nla_put_failure; break; + case TUNNEL_VXLAN_OPT: + err = fl_dump_key_vxlan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; + case TUNNEL_ERSPAN_OPT: + err = fl_dump_key_erspan_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3177dcb17316..d99966a55c84 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl) *err = -1; return; } - dst->value = sk->sk_ack_backlog; + dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) @@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl) *err = -1; return; } - dst->value = sk->sk_max_ack_backlog; + dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 98dd87ce1510..b1c7e726ce5d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -530,8 +530,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } - if (time_next_packet && - (s64)(now - time_next_packet - q->ce_threshold) > 0) { + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index c261c0a18868..968519ff36e9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -14,7 +14,6 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8769b4b8807d..5ab696efca95 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -382,13 +382,8 @@ void __qdisc_run(struct Qdisc *q) int packets; while (qdisc_restart(q, &packets)) { - /* - * Ordered by possible occurrence: Postpone processing if - * 1. we've exceeded packet quota - * 2. another process needs the CPU; - */ quota -= packets; - if (quota <= 0 || need_resched()) { + if (quota <= 0) { __netif_schedule(q); break; } @@ -657,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else { - qdisc->empty = true; + WRITE_ONCE(qdisc->empty, true); } return skb; @@ -1214,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head) /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { - while (some_qdisc_is_busy(dev)) - yield(); + while (some_qdisc_is_busy(dev)) { + /* wait_event() would avoid this sleep-loop but would + * require expensive checks in the fast paths of packet + * processing which isn't worth it. + */ + schedule_timeout_uninterruptible(1); + } /* The new qdisc is assigned at this point so we can safely * unwind stale skb lists and qdisc statistics */ diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index df98a887eb89..b0b0dc46af61 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -22,6 +22,7 @@ #define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 +#define DTIME_INVALID 0xffffffffffffffff #define MAX_PROB 0xffffffffffffffff #define PIE_SCALE 8 @@ -34,6 +35,7 @@ struct pie_params { u32 beta; /* and are used for shift relative to 1 */ bool ecn; /* true if ecn is enabled */ bool bytemode; /* to scale drop early prob based on pkt size */ + u8 dq_rate_estimator; /* to calculate delay using Little's law */ }; /* variables used */ @@ -77,11 +79,34 @@ static void pie_params_init(struct pie_params *params) params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->ecn = false; params->bytemode = false; + params->dq_rate_estimator = false; +} + +/* private skb vars */ +struct pie_skb_cb { + psched_time_t enqueue_time; +}; + +static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); + return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) +{ + return get_pie_cb(skb)->enqueue_time; +} + +static void pie_set_enqueue_time(struct sk_buff *skb) +{ + get_pie_cb(skb)->enqueue_time = psched_get_time(); } static void pie_vars_init(struct pie_vars *vars) { vars->dq_count = DQCOUNT_INVALID; + vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->avg_dq_rate = 0; /* default of 150 ms in pschedtime */ @@ -172,6 +197,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* we can enqueue the packet */ if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->params.dq_rate_estimator) + pie_set_enqueue_time(skb); + q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); @@ -194,6 +223,7 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_BETA] = {.type = NLA_U32}, [TCA_PIE_ECN] = {.type = NLA_U32}, [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -247,6 +277,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_PIE_BYTEMODE]) q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) + q->params.dq_rate_estimator = + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { @@ -266,6 +300,28 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); int qlen = sch->qstats.backlog; /* current queue size in bytes */ + psched_time_t now = psched_get_time(); + u32 dtime = 0; + + /* If dq_rate_estimator is disabled, calculate qdelay using the + * packet timestamp. + */ + if (!q->params.dq_rate_estimator) { + q->vars.qdelay = now - pie_get_enqueue_time(skb); + + if (q->vars.dq_tstamp != DTIME_INVALID) + dtime = now - q->vars.dq_tstamp; + + q->vars.dq_tstamp = now; + + if (qlen == 0) + q->vars.qdelay = 0; + + if (dtime == 0) + return; + + goto burst_allowance_reduction; + } /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save @@ -289,10 +345,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) q->vars.dq_count += skb->len; if (q->vars.dq_count >= QUEUE_THRESHOLD) { - psched_time_t now = psched_get_time(); - u32 dtime = now - q->vars.dq_tstamp; u32 count = q->vars.dq_count << PIE_SCALE; + dtime = now - q->vars.dq_tstamp; + if (dtime == 0) return; @@ -317,14 +373,19 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) q->vars.dq_tstamp = psched_get_time(); } - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; - else - q->vars.burst_time = 0; - } + goto burst_allowance_reduction; } } + + return; + +burst_allowance_reduction: + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } } static void calculate_probability(struct Qdisc *sch) @@ -332,19 +393,25 @@ static void calculate_probability(struct Qdisc *sch) struct pie_sched_data *q = qdisc_priv(sch); u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ - psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ u64 oldprob; u64 alpha, beta; u32 power; bool update_prob = true; - q->vars.qdelay_old = q->vars.qdelay; + if (q->params.dq_rate_estimator) { + qdelay_old = q->vars.qdelay; + q->vars.qdelay_old = q->vars.qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; - else - qdelay = 0; + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + } else { + qdelay = q->vars.qdelay; + qdelay_old = q->vars.qdelay_old; + } /* If qdelay is zero and qlen is not, it means qlen is very small, less * than dequeue_rate, so we do not update probabilty in this round @@ -430,14 +497,18 @@ static void calculate_probability(struct Qdisc *sch) /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero - * 3. We have atleast one estimate for the avg_dq_rate ie., - * is a non-zero value + * 3. If average dq_rate_estimator is enabled, we have atleast one + * estimate for the avg_dq_rate ie., is a non-zero value */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.qdelay_old < q->params.target / 2) && q->vars.prob == 0 && - q->vars.avg_dq_rate > 0) + (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) { pie_vars_init(&q->vars); + } + + if (!q->params.dq_rate_estimator) + q->vars.qdelay_old = qdelay; } static void pie_timer(struct timer_list *t) @@ -497,7 +568,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, + q->params.dq_rate_estimator)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -514,9 +587,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .prob = q->vars.prob, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, - /* unscale and return dq_rate in bytes per sec */ - .avg_dq_rate = q->vars.avg_dq_rate * - (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, @@ -524,6 +594,14 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ecn_mark = q->stats.ecn_mark, }; + /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ + st.dq_rate_estimating = q->params.dq_rate_estimator; + + /* unscale and return dq_rate in bytes per sec */ + if (q->params.dq_rate_estimator) + st.avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; + return gnet_stats_copy_app(d, &st, sizeof(st)); } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index d2ffc9a0ba3a..bbd5004a5d09 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -64,6 +64,7 @@ static struct sctp_association *sctp_association_init( /* Discarding const is appropriate here. */ asoc->ep = (struct sctp_endpoint *)ep; asoc->base.sk = (struct sock *)sk; + asoc->base.net = sock_net(sk); sctp_endpoint_hold(asoc->ep); sock_hold(asoc->base.sk); @@ -86,6 +87,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; asoc->pf_retrans = sp->pf_retrans; + asoc->ps_retrans = sp->ps_retrans; + asoc->pf_expose = sp->pf_expose; asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); @@ -324,7 +327,7 @@ void sctp_association_free(struct sctp_association *asoc) * socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); } /* Mark as dead, so other users can know this structure is @@ -429,6 +432,8 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; + sctp_ulpevent_nofity_peer_addr_change(transport, + SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ memcpy(&asoc->peer.primary_addr, &transport->ipaddr, @@ -569,6 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -624,6 +630,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, /* And the partial failure retrans threshold */ peer->pf_retrans = asoc->pf_retrans; + /* And the primary path switchover retrans threshold */ + peer->ps_retrans = asoc->ps_retrans; /* Initialize the peer's SACK delay timeout based on the * association configured value. @@ -707,6 +715,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; + sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { sctp_assoc_set_primary(asoc, peer); @@ -781,9 +791,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error) { - struct sctp_ulpevent *event; - struct sockaddr_storage addr; - int spc_state = 0; + int spc_state = SCTP_ADDR_AVAILABLE; bool ulp_notify = true; /* Record the transition on the transport. */ @@ -793,19 +801,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to heartbeat success, report the SCTP_ADDR_CONFIRMED * state to the user, otherwise report SCTP_ADDR_AVAILABLE. */ - if (SCTP_UNCONFIRMED == transport->state && - SCTP_HEARTBEAT_SUCCESS == error) - spc_state = SCTP_ADDR_CONFIRMED; - else - spc_state = SCTP_ADDR_AVAILABLE; - /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1 MTU, see SCTP - * Quick failover draft section 5.1, point 5 - */ - if (transport->state == SCTP_PF) { + if (transport->state == SCTP_PF && + asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; - transport->cwnd = asoc->pathmtu; - } + else if (transport->state == SCTP_UNCONFIRMED && + error == SCTP_HEARTBEAT_SUCCESS) + spc_state = SCTP_ADDR_CONFIRMED; + transport->state = SCTP_ACTIVE; break; @@ -814,19 +816,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to inactive state. Also, release the cached route since * there may be a better route next time. */ - if (transport->state != SCTP_UNCONFIRMED) + if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; - else { + spc_state = SCTP_ADDR_UNREACHABLE; + } else { sctp_transport_dst_release(transport); ulp_notify = false; } - - spc_state = SCTP_ADDR_UNREACHABLE; break; case SCTP_TRANSPORT_PF: transport->state = SCTP_PF; - ulp_notify = false; + if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) + ulp_notify = false; + else + spc_state = SCTP_ADDR_POTENTIALLY_FAILED; break; default: @@ -836,16 +840,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, /* Generate and send a SCTP_PEER_ADDR_CHANGE notification * to the user. */ - if (ulp_notify) { - memset(&addr, 0, sizeof(struct sockaddr_storage)); - memcpy(&addr, &transport->ipaddr, - transport->af_specific->sockaddr_len); - - event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, - 0, spc_state, error, GFP_ATOMIC); - if (event) - asoc->stream.si->enqueue_event(&asoc->ulpq, event); - } + if (ulp_notify) + sctp_ulpevent_nofity_peer_addr_change(transport, + spc_state, error); /* Select new active and retran paths. */ sctp_select_active_and_retran_path(asoc); @@ -1077,7 +1074,7 @@ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) /* Decrement the backlog value for a TCP-style socket. */ if (sctp_style(oldsk, TCP)) - oldsk->sk_ack_backlog--; + sk_acceptq_removed(oldsk); /* Release references to the old endpoint and the sock. */ sctp_endpoint_put(assoc->ep); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc0405c79dfc..cc3ce5d80b08 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; - int error = 0, notify; - - /* If we failed, we may need to notify. */ - notify = msg->send_failed ? -1 : 0; + int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); - /* Check whether we _really_ need to notify. */ - if (notify < 0) { - asoc = chunk->asoc; - if (msg->send_error) - error = msg->send_error; - else - error = asoc->outqueue.error; - - notify = sctp_ulpevent_type_enabled(asoc->subscribe, - SCTP_SEND_FAILED); + + if (!msg->send_failed) { + sctp_chunk_put(chunk); + continue; } - /* Generate a SEND FAILED event only if enabled. */ - if (notify > 0) { - int sent; - if (chunk->has_tsn) - sent = SCTP_DATA_SENT; - else - sent = SCTP_DATA_UNSENT; + asoc = chunk->asoc; + error = msg->send_error ?: asoc->outqueue.error; + sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } + if (sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED_EVENT)) { + ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, + sent, error, + GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + sctp_chunk_put(chunk); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 0851166b9175..8a15146faaeb 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -425,8 +425,8 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index ea53049d1db6..3ccab7440c9e 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -110,6 +110,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Remember who we are attached to. */ ep->base.sk = sk; + ep->base.net = sock_net(sk); sock_hold(ep->base.sk); return ep; @@ -164,7 +165,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) - sk->sk_ack_backlog++; + sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until diff --git a/net/sctp/input.c b/net/sctp/input.c index 2277981559d0..4d2bcfc9d7f8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -882,7 +882,7 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, if (!sctp_transport_hold(t)) return err; - if (!net_eq(sock_net(t->asoc->base.sk), x->net)) + if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; @@ -897,7 +897,7 @@ static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; - return sctp_hashfn(sock_net(t->asoc->base.sk), + return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 08d14d86ecfb..fbbf19128c2d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1217,9 +1217,15 @@ static int __net_init sctp_defaults_init(struct net *net) /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; + /* Disable of Primary Path Switchover by default */ + net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; + /* Enable pf state by default */ net->sctp.pf_enable = 1; + /* Ignore pf exposure feature by default */ + net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e52b2128e43b..acd737d4c0e0 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -567,6 +567,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, SCTP_FAILED_THRESHOLD); } + if (transport->error_count > transport->ps_retrans && + asoc->peer.primary_path == transport && + asoc->peer.active_path != transport) + sctp_assoc_set_primary(asoc, asoc->peer.active_path); + /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 0c21c52fc408..4ab8208a2dd4 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2160,8 +2160,10 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( /* Update socket peer label if first association. */ if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) + chunk->skb)) { + sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } /* Set temp so that it won't be added into hashtable */ new_asoc->temp = 1; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ffd3262b7a41..83e4ca1fabda 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3943,18 +3943,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, char __user *optval, - unsigned int optlen) + unsigned int optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int len; - if (optlen < sizeof(struct sctp_paddrthlds)) + len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (optlen < len) return -EINVAL; - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, - sizeof(struct sctp_paddrthlds))) + if (copy_from_user(&val, optval, len)) return -EFAULT; + if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + return -EINVAL; + if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); @@ -3963,6 +3967,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; return 0; @@ -3978,17 +3984,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, transports) { if (val.spt_pathmaxrxt) trans->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + trans->ps_retrans = val.spt_pathcpthld; trans->pf_retrans = val.spt_pathpfthld; } if (val.spt_pathmaxrxt) asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + asoc->ps_retrans = val.spt_pathcpthld; asoc->pf_retrans = val.spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val.spt_pathmaxrxt) sp->pathmaxrxt = val.spt_pathmaxrxt; + if (v2) + sp->ps_retrans = val.spt_pathcpthld; sp->pf_retrans = val.spt_pathpfthld; } @@ -4589,6 +4601,40 @@ out: return retval; } +static int sctp_setsockopt_pf_expose(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + if (asoc) + asoc->pf_expose = params.assoc_value; + else + sctp_sk(sk)->pf_expose = params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4744,7 +4790,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); @@ -4798,6 +4849,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5041,6 +5095,8 @@ static int sctp_init_sock(struct sock *sk) sp->hbinterval = net->sctp.hb_interval; sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; + sp->ps_retrans = net->sctp.ps_retrans; + sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; @@ -5521,8 +5577,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); - if (!transport) - return -EINVAL; + if (!transport) { + retval = -EINVAL; + goto out; + } + + if (transport->state == SCTP_PF && + transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { + retval = -EACCES; + goto out; + } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; @@ -7170,18 +7234,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, - int len, - int __user *optlen) + char __user *optval, int len, + int __user *optlen, bool v2) { - struct sctp_paddrthlds val; + struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; + int min; - if (len < sizeof(struct sctp_paddrthlds)) + min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + if (len < min) return -EINVAL; - len = sizeof(struct sctp_paddrthlds); - if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len)) + len = min; + if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { @@ -7192,6 +7257,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; + val.spt_pathcpthld = trans->ps_retrans; goto out; } @@ -7204,11 +7270,13 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; + val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; + val.spt_pathcpthld = sp->ps_retrans; } out: @@ -7900,6 +7968,45 @@ out: return retval; } +static int sctp_getsockopt_pf_expose(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->pf_expose + : sctp_sk(sk)->pf_expose; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8049,7 +8156,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen); + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, false); + break; + case SCTP_PEER_ADDR_THLDS_V2: + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, + optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); @@ -8112,6 +8224,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_ECN_SUPPORTED: retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); break; + case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: + retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -8376,7 +8491,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } } - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); return sctp_hash_endpoint(ep); } @@ -8430,7 +8545,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 238cf1737576..4740aa70e652 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ static int rto_alpha_min = 0; static int rto_beta_min = 0; static int rto_alpha_max = 1000; static int rto_beta_max = 1000; +static int pf_expose_max = SCTP_PF_EXPOSE_MAX; +static int ps_retrans_max = SCTP_PS_RETRANS_MAX; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -212,7 +214,16 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, + .extra2 = &init_net.sctp.ps_retrans, + }, + { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, }, { .procname = "sndbuf_policy", @@ -318,6 +329,15 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pf_expose", + .data = &init_net.sctp.pf_expose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &pf_expose_max, + }, { /* sentinel */ } }; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e0cc1edf49a0..c82dbdcf13f2 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -238,7 +238,7 @@ fail: * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( +static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) @@ -336,6 +336,22 @@ fail: return NULL; } +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error) +{ + struct sctp_association *asoc = transport->asoc; + struct sockaddr_storage addr; + struct sctp_ulpevent *event; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, + error, GFP_ATOMIC); + if (event) + asoc->stream.si->enqueue_event(&asoc->ulpq, event); +} + /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the @@ -511,6 +527,45 @@ fail: return NULL; } +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, gfp_t gfp) +{ + struct sctp_send_failed_event *ssf; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int len; + + skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); + if (!skb) + return NULL; + + len = ntohs(chunk->chunk_hdr->length); + len -= sctp_datachk_len(&asoc->stream); + + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + ssf = skb_push(skb, sizeof(*ssf)); + ssf->ssf_type = SCTP_SEND_FAILED_EVENT; + ssf->ssf_flags = flags; + ssf->ssf_length = sizeof(*ssf) + len; + skb_trim(skb, ssf->ssf_length); + ssf->ssf_error = error; + + ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; + ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; + ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; + ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; + ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; + + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + + return event; +} + /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 737b49909a7a..b997072c72e5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -25,6 +25,7 @@ #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> #include <net/sock.h> #include <net/tcp.h> @@ -174,6 +175,7 @@ static int smc_release(struct socket *sock) if (!sk) goto out; + sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ @@ -196,6 +198,7 @@ static int smc_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -978,12 +981,14 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; __smc_release(smc); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } @@ -2035,22 +2040,28 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_pnet; + } + rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto6, 1); @@ -2082,6 +2093,8 @@ out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); +out_core: + smc_core_exit(); out_pnet: smc_pnet_exit(); out_pernet_subsys: @@ -2092,14 +2105,15 @@ out_pernet_subsys: static void __exit smc_exit(void) { - smc_core_exit(); static_branch_disable(&tcp_have_smc); - smc_ib_unregister_client(); sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); } module_init(smc_init); diff --git a/net/smc/smc.h b/net/smc/smc.h index 878313f8d6c1..be11ba41190f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,6 +188,7 @@ struct smc_connection { * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..164f1584861b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (!conn->alert_token_local) + if (conn->killed) /* abnormal termination */ rc = -EPIPE; return rc; @@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; + if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + return -EPIPE; + if (conn->lgr->is_smcd) { spin_lock_bh(&conn->send_lock); rc = smcd_cdc_msg_send(conn); @@ -328,7 +331,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) struct smcd_cdc_msg cdc; struct smc_sock *smc; - if (!conn) + if (!conn || conn->killed) return; data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 49bcebff6378..0879f7bed967 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; - smc_lgr_terminate(smc->conn.lgr); + smc_lgr_terminate(smc->conn.lgr, true); } } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fc06720b53c1..290270c821ca 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -13,14 +13,13 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_tx.h" #include "smc_cdc.h" #include "smc_close.h" -#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) - /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { @@ -65,8 +64,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - (sk->sk_err == ECONNABORTED) || - (sk->sk_err == ECONNRESET), + sk->sk_err == ECONNABORTED || + sk->sk_err == ECONNRESET || + smc->conn.killed, &wait); if (rc) break; @@ -95,68 +95,73 @@ static int smc_close_final(struct smc_connection *conn) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } -static int smc_close_abort(struct smc_connection *conn) +int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return smc_cdc_get_slot_and_msg_send(conn); } +static void smc_close_cancel_work(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + + release_sock(sk); + cancel_work_sync(&smc->conn.close_work); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + sk->sk_state = SMC_CLOSED; +} + /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ -static void smc_close_active_abort(struct smc_sock *smc) +void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; - - struct smc_cdc_conn_state_flags *txflags = - &smc->conn.local_tx_ctrl.conn_state_flags; + bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); - } + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - if (!smc_cdc_rxed_any_close(&smc->conn)) - sk->sk_state = SMC_PEERABORTWAIT; - else - sk->sk_state = SMC_CLOSED; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: - if (!txflags->peer_conn_closed) { - /* just SHUTDOWN_SEND done */ - sk->sk_state = SMC_PEERABORTWAIT; - } else { - sk->sk_state = SMC_CLOSED; - } + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; - break; - case SMC_PEERFINCLOSEWAIT: - sock_put(sk); /* passive closing */ + smc_conn_free(&smc->conn); + release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: @@ -166,6 +171,12 @@ static void smc_close_active_abort(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -215,8 +226,6 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -229,8 +238,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } sk->sk_state = SMC_CLOSED; break; @@ -246,8 +253,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_final(conn); - if (rc) - break; if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; @@ -263,8 +268,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -272,10 +275,12 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - smc_close_abort(conn); + rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; @@ -344,12 +349,6 @@ static void smc_close_passive_work(struct work_struct *work) lock_sock(sk); old_state = sk->sk_state; - if (!conn->alert_token_local) { - /* abnormal termination */ - smc_close_active_abort(smc); - goto wakeup; - } - rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ @@ -451,8 +450,6 @@ again: goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: @@ -466,8 +463,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index e0e3b5df25d2..634fea2b7c95 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -24,5 +24,7 @@ int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2ba97ff325a5..bb92c7c6214c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -13,6 +13,8 @@ #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/reboot.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -39,23 +41,46 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +static atomic_t lgr_cnt; /* number of existing link groups */ +static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - mod_delayed_work(system_wq, &lgr->free_work, - (!lgr->is_smcd && lgr->role == SMC_CLNT) ? - SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); + if (!lgr->freeing && !lgr->freefast) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { - mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); + if (!lgr->freeing && !lgr->freefast) { + lgr->freefast = 1; + mod_delayed_work(system_wq, &lgr->free_work, + SMC_LGR_FREE_DELAY_FAST); + } } /* Register connection's alert token in our lookup structure. @@ -134,16 +159,17 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ -static int smc_link_send_delete(struct smc_link *lnk) +static int smc_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { smc_llc_link_deleting(lnk); return 0; } @@ -157,48 +183,62 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); + spinlock_t *lgr_lock; + struct smc_link *lnk; bool conns; - spin_lock_bh(&smc_lgr_list.lock); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); return; } - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); /* remove from smc_lgr_list */ - spin_unlock_bh(&smc_lgr_list.lock); + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk)) { + !smc_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); + spin_unlock_bh(lgr_lock); return; } } + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); + + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + if (lgr->is_smcd && !lgr->terminating) + smc_ism_signal_shutdown(lgr); + smc_lgr_free(lgr); +} - if (!delayed_work_pending(&lgr->free_work)) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); - } + smc_lgr_terminate(lgr, true); } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; + struct list_head *lgr_list; struct smc_link *lnk; + spinlock_t *lgr_lock; u8 rndvec[3]; int rc = 0; int i; @@ -217,6 +257,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; + lgr->terminating = 0; + lgr->freefast = 0; + lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); @@ -228,13 +271,20 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ + get_device(&ini->ism_dev->dev); lgr->peer_gid = ini->ism_gid; lgr->smcd = ini->ism_dev; + lgr_list = &ini->ism_dev->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; + lgr->peer_shutdown = 0; + atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ + get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); @@ -245,6 +295,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; if (!ini->ib_dev->initialized) @@ -272,11 +324,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; + atomic_inc(&lgr_cnt); + atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; - spin_lock_bh(&smc_lgr_list.lock); - list_add(&lgr->list, &smc_lgr_list.list); - spin_unlock_bh(&smc_lgr_list.lock); + spin_lock_bh(lgr_lock); + list_add(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); return 0; destroy_qp: @@ -309,7 +363,7 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd) { + if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], @@ -335,14 +389,16 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr) return; if (lgr->is_smcd) { - smc_ism_unset_conn(conn); + if (!list_empty(&lgr->list)) + smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn, lgr); /* allow buffer reuse */ - conn->lgr = NULL; + if (!list_empty(&lgr->list)) { + smc_lgr_unregister_conn(conn); + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); @@ -357,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) + wake_up(&lnk->smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -433,24 +491,101 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - else + if (lgr->is_smcd) { + if (!lgr->terminating) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + struct list_head *lgr_list; + spinlock_t *lgr_lock; + + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); +} + +static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + struct smc_buf_desc *buf_desc; + + list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } + } } -/* terminate linkgroup abnormally */ -static void __smc_lgr_terminate(struct smc_link_group *lgr) +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); +} + +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn, bool soft) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + smc_close_abort(conn); + conn->killed = 1; + smc->sk.sk_err = ECONNABORTED; + smc_sk_wake_ups(smc); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + if (soft) + tasklet_kill(&conn->rx_tsklet); + else + tasklet_unlock_wait(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } + smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); +} + +static void smc_lgr_cleanup(struct smc_link_group *lgr) +{ + if (lgr->is_smcd) { + smc_ism_signal_shutdown(lgr); + smcd_unregister_all_dmbs(lgr); + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } else { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + wake_up(&lnk->wr_reg_wait); + if (lnk->state != SMC_LNK_INACTIVE) { + smc_link_send_delete(lnk, false); + smc_llc_link_inactive(lnk); + } + } +} + +/* terminate link group */ +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; @@ -458,80 +593,161 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ + if (!soft) + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!list_empty(&lgr->list)) /* forget lgr */ - list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - write_lock_bh(&lgr->conns_lock); + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { + read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); /* sock_put in close work */ - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - __smc_lgr_unregister_conn(conn); - conn->lgr = NULL; - write_unlock_bh(&lgr->conns_lock); - if (!schedule_work(&conn->close_work)) - sock_put(&smc->sk); - write_lock_bh(&lgr->conns_lock); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn, soft); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } - write_unlock_bh(&lgr->conns_lock); - if (!lgr->is_smcd) - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work(lgr); + read_unlock_bh(&lgr->conns_lock); + smc_lgr_cleanup(lgr); + if (soft) + smc_lgr_schedule_free_work_fast(lgr); + else + smc_lgr_free(lgr); } -void smc_lgr_terminate(struct smc_link_group *lgr) +/* unlink and terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { - spin_lock_bh(&smc_lgr_list.lock); - __smc_lgr_terminate(lgr); - spin_unlock_bh(&smc_lgr_list.lock); + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->terminating) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + if (!soft) + lgr->freeing = 1; + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, soft); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - __smc_lgr_terminate(lgr); + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } } -/* Called when SMC-D device is terminated or peer is lost */ +/* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd && lgr->smcd == dev && - (!peer_gid || lgr->peer_gid == peer_gid) && + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - __smc_lgr_terminate(lgr); + if (peer_gid) /* peer triggered termination */ + lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); } } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); - cancel_delayed_work_sync(&lgr->free_work); - if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); + schedule_work(&lgr->terminate_work); + } +} + +/* Called when an SMCD device is removed or the smc module is unloaded */ +void smc_smcd_terminate_all(struct smcd_dev *smcd) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smcd->lgr_lock); + list_splice_init(&smcd->lgr_list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + spin_unlock_bh(&smcd->lgr_lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (atomic_read(&smcd->lgr_cnt)) + wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); +} + +/* Called when an SMCR device is removed or the smc module is unloaded. + * If smcibdev is given, all SMCR link groups using this device are terminated. + * If smcibdev is NULL, all SMCR link groups are terminated. + */ +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!smcibdev) { + list_splice_init(&smc_lgr_list.list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + } else { + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (smcibdev) { + if (atomic_read(&smcibdev->lnk_cnt)) + wait_event(smcibdev->lnks_deleted, + !atomic_read(&smcibdev->lnk_cnt)); + } else { + if (atomic_read(&lgr_cnt)) + wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } @@ -607,10 +823,14 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; + spinlock_t *lgr_lock; int rc = 0; + lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->srv_first_contact) @@ -618,8 +838,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto create; /* determine if an existing link group can be reused */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry(lgr, &smc_lgr_list.list, list) { + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : @@ -639,7 +859,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } write_unlock_bh(&lgr->conns_lock); } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -1027,29 +1247,63 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return 0; } -/* Called (from smc_exit) when module is removed */ -void smc_core_exit(void) +static void smc_core_going_away(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - if (!lgr->is_smcd) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; - if (lnk->state == SMC_LNK_ACTIVE) - smc_llc_send_delete_link(lnk, SMC_LLC_REQ, - false); - smc_llc_link_inactive(lnk); - } - cancel_delayed_work_sync(&lgr->free_work); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); /* free link group */ + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); + } + spin_unlock(&smc_ib_devices.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; } + spin_unlock(&smcd_dev_list.lock); +} + +/* Clean up all SMC link groups */ +static void smc_lgrs_shutdown(void) +{ + struct smcd_dev *smcd; + + smc_core_going_away(); + + smc_smcr_terminate_all(NULL); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + smc_smcd_terminate_all(smcd); + spin_unlock(&smcd_dev_list.lock); +} + +static int smc_core_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + smc_lgrs_shutdown(); + + return 0; +} + +static struct notifier_block smc_reboot_notifier = { + .notifier_call = smc_core_reboot_event, +}; + +int __init smc_core_init(void) +{ + atomic_set(&lgr_cnt, 0); + return register_reboot_notifier(&smc_reboot_notifier); +} + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + unregister_reboot_notifier(&smc_reboot_notifier); + smc_lgrs_shutdown(); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c00ac61dc129..c472e12951d1 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -202,8 +202,11 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + u8 freefast : 1; /* free worker scheduled fast */ + u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ union { @@ -225,6 +228,8 @@ struct smc_link_group { /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ + u8 peer_shutdown : 1; + /* peer triggered shutdownn */ }; }; }; @@ -280,15 +285,23 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + if (!lgr->terminating && !lgr->freeing) + schedule_work(&lgr->terminate_work); +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); -void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); +void smc_smcd_terminate_all(struct smcd_dev *dev); +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, @@ -305,6 +318,7 @@ void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); +int smc_core_init(void); void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d14ca4af6f94..548632621f4b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -15,6 +15,7 @@ #include <linux/random.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -242,8 +243,12 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); - if (!smc_ib_port_active(smcibdev, port_idx + 1)) + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); smc_port_terminate(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + } } } @@ -259,8 +264,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: /* terminate all ports on device */ - for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); + } schedule_work(&smcibdev->port_event_work); break; case IB_EVENT_PORT_ERR: @@ -269,6 +276,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, port_idx = ibevent->element.port_num - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + if (ibevent->event == IB_EVENT_PORT_ERR) + set_bit(port_idx, smcibdev->ports_going_away); + else if (ibevent->event == IB_EVENT_PORT_ACTIVE) + clear_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -307,6 +318,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) port_idx = ibevent->element.qp->port - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -509,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) return; smcibdev->initialized = 0; - smc_wr_remove_dev(smcibdev); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); } static struct ib_client smc_ib_client; @@ -532,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); - + atomic_set(&smcibdev->lnk_cnt, 0); + init_waitqueue_head(&smcibdev->lnks_deleted); spin_lock(&smc_ib_devices.lock); list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); @@ -554,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev) schedule_work(&smcibdev->port_event_work); } -/* callback function for ib_register_client() */ +/* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev; @@ -564,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index da60ab9e8d70..255db87547d3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <net/smc.h> @@ -47,6 +48,9 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); + atomic_t lnk_cnt; /* number of links on ibdev */ + wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e89e918b88e0..5c4727d5066e 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -146,6 +146,10 @@ out: int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; + int rc = 0; + + if (!dmb_desc->dma_addr) + return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - return smcd->ops->unregister_dmb(smcd, &dmb); + rc = smcd->ops->unregister_dmb(smcd, &dmb); + if (!rc || rc == ISM_ERROR) { + dmb_desc->cpu_addr = NULL; + dmb_desc->dma_addr = 0; + } + + return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, @@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) int rc; union smcd_sw_event_info ev_info; + if (lgr->peer_shutdown) + return 0; + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; @@ -286,7 +299,10 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) { @@ -311,11 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { spin_lock(&smcd_dev_list.lock); - list_del(&smcd->list); + list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } @@ -342,6 +359,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) { struct smc_ism_event_work *wrk; + if (smcd->going_away) + return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) @@ -367,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; - if (conn) + if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4fd60c522802..a9f6431dd69a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_terminate_sched(lgr); } } @@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate(smc_get_lgr(link), true); return; } next_interval = link->llc_testlink_time; @@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) void smc_llc_link_deleting(struct smc_link *link) { link->state = SMC_LNK_DELETING; + smc_wr_wakeup_tx_wait(link); } /* called in tasklet context */ @@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link) { link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* called in worker context */ @@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int rc; + int rc = 0; mutex_lock(&link->llc_delete_rkey_mutex); + if (link->state != SMC_LNK_ACTIVE) + goto out; reinit_completion(&link->llc_delete_rkey); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 571e6d84da3b..82dedf052d86 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -779,6 +779,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -818,6 +819,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL)) { ini->ib_dev = ibdev; @@ -844,7 +846,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { - if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away) { ini->ism_dev = ismdev; break; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 97e8369002d7..39d7b34d06d2 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; int rc; @@ -210,7 +212,9 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || + cflags->peer_conn_abort || sk->sk_shutdown & RCV_SHUTDOWN || + conn->killed || fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); @@ -314,11 +318,13 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; + if (conn->killed) + break; + if (smc_rx_recvmsg_data_available(smc)) goto copy; - if (sk->sk_shutdown & RCV_SHUTDOWN || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) { + if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6c8f09c1ce51..0d42e7716b91 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -86,6 +86,7 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; @@ -155,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; @@ -282,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - smc_lgr_terminate(lgr); - } + if (rc) + smc_lgr_terminate(lgr, true); return rc; } @@ -495,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; rc = 0; - if (conn->alert_token_local) /* connection healthy */ - mod_delayed_work(system_wq, &conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } return rc; } @@ -547,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -573,9 +576,7 @@ void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); - if (smc->sk.sk_err || - !conn->alert_token_local || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + if (smc->sk.sk_err) goto out; rc = smc_tx_sndbuf_nonempty(conn); @@ -608,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && - conn->alert_token_local) { /* connection healthy */ + !conn->killed) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 253aa75dc2b6..337ee52ad3d3 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /*------------------------------- completion --------------------------------*/ +/* returns true if at least one tx work request is pending on the given link */ +static inline bool smc_wr_is_tx_pend(struct smc_link *link) +{ + if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != + link->wr_tx_cnt) { + return true; + } + return false; +} + +/* wait till all pending tx work requests on the given link are completed */ +static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +{ + if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), + SMC_WR_TX_WAIT_PENDING_TIME)) + return 0; + else /* timeout */ + return -EPIPE; +} + static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) { u32 i; @@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) link->wr_reg_state = FAILED; else link->wr_reg_state = CONFIRMED; - wake_up(&link->wr_reg_wait); + smc_wr_wakeup_reg_wait(link); return; } @@ -101,7 +121,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, struct smc_rdma_wr **wr_rdma_buf, struct smc_wr_tx_pend_priv **wr_pend_priv) { + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_wr_tx_pend *wr_pend; u32 idx = link->wr_tx_cnt; struct ib_send_wr *wr_ib; @@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, *wr_buf = NULL; *wr_pend_priv = NULL; - if (in_softirq()) { + if (in_softirq() || lgr->terminating) { rc = smc_wr_tx_get_free_slot_index(link, &idx); if (rc) return rc; } else { - rc = wait_event_timeout( + rc = wait_event_interruptible_timeout( link->wr_tx_wait, link->state == SMC_LNK_INACTIVE || + lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(lgr); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); + wake_up(&link->wr_tx_wait); return 1; } @@ -247,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } return rc; } @@ -272,7 +295,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -373,7 +396,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + if (smc_wr_tx_wait_no_pending_sends(lnk)) + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * + sizeof(*lnk->wr_tx_mask)); if (!lnk->smcibdev) return; diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 09bf32fd3959..3ac99c898418 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) +{ + wake_up_all(&lnk->wr_tx_wait); +} + +static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) +{ + wake_up(&lnk->wr_reg_wait); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index b83e16ade4d2..716b61a701a8 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -35,6 +35,21 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y +config TIPC_CRYPTO + bool "TIPC encryption support" + depends on TIPC + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + help + Saying Y here will enable support for TIPC encryption. + All TIPC messages will be encrypted/decrypted by using the currently most + advanced algorithm: AEAD AES-GCM (like IPSec or TLS) before leaving/ + entering the TIPC stack. + Key setting from user-space is performed via netlink by a user program + (e.g. the iproute2 'tipc' tool). + bool + default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index c86aba0282af..11255e970dd4 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -16,6 +16,7 @@ CFLAGS_trace.o += -I$(src) tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o +tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o obj-$(CONFIG_TIPC_DIAG) += diag.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6ef1abdd525f..55aeba681cf4 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -84,12 +84,12 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) */ int tipc_bcast_get_mtu(struct net *net) { - return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; + return tipc_link_mss(tipc_bc_sndlink(net)); } -void tipc_bcast_disable_rcast(struct net *net) +void tipc_bcast_toggle_rcast(struct net *net, bool supp) { - tipc_bc_base(net)->rcast_support = false; + tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dadad953e2be..9e847d9617d3 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); -void tipc_bcast_disable_rcast(struct net *net); +void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 0214aa1c4427..d7ec26bd739d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "udp_media.h" #include "trace.h" +#include "crypto.h" #define MAX_ADDR_STR 60 @@ -315,6 +316,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->net_plane = bearer_id + 'A'; b->priority = prio; test_and_set_bit_lock(0, &b->up); + refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -351,6 +353,17 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +bool tipc_bearer_hold(struct tipc_bearer *b) +{ + return (b && refcount_inc_not_zero(&b->refcnt)); +} + +void tipc_bearer_put(struct tipc_bearer *b) +{ + if (b && refcount_dec_and_test(&b->refcnt)) + kfree_rcu(b, rcu); +} + /** * bearer_disable * @@ -369,7 +382,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) if (b->disc) tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); - kfree_rcu(b, rcu); + tipc_bearer_put(b); tipc_mon_delete(net, bearer_id); } @@ -504,10 +517,15 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, rcu_read_lock(); b = bearer_get(net, bearer_id); - if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) - b->media->send_msg(net, skb, b, dest); - else + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dest, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dest); + } else { kfree_skb(skb); + } rcu_read_unlock(); } @@ -515,7 +533,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, */ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst) + struct tipc_media_addr *dst, + struct tipc_node *__dnode) { struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -529,10 +548,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) - b->media->send_msg(net, skb, b, dst); - else + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) { +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); + } else { kfree_skb(skb); + } } rcu_read_unlock(); } @@ -543,6 +567,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq) { struct tipc_net *tn = tipc_net(net); + struct tipc_media_addr *dst; int net_id = tn->net_id; struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -557,7 +582,12 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, msg_set_non_seq(hdr, 1); msg_set_mc_netid(hdr, net_id); __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, &b->bcast_addr); + dst = &b->bcast_addr; +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_xmit(net, &skb, b, dst, NULL); + if (skb) +#endif + b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } @@ -584,6 +614,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index ea0f3c49cbed..d0c79cc6c0c2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -165,6 +165,7 @@ struct tipc_bearer { struct tipc_discoverer *disc; char net_plane; unsigned long up; + refcount_t refcnt; }; struct tipc_bearer_names { @@ -210,6 +211,8 @@ int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); +bool tipc_bearer_hold(struct tipc_bearer *b); +void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); @@ -229,7 +232,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, - struct tipc_media_addr *dst); + struct tipc_media_addr *dst, + struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); diff --git a/net/tipc/core.c b/net/tipc/core.c index 8f35060a24e1..7532a00ac73d 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -42,6 +42,7 @@ #include "socket.h" #include "bcast.h" #include "node.h" +#include "crypto.h" #include <linux/module.h> @@ -66,6 +67,11 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); +#ifdef CONFIG_TIPC_CRYPTO + err = tipc_crypto_start(&tn->crypto_tx, net, NULL); + if (err) + goto out_crypto; +#endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -91,6 +97,11 @@ out_bclink: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tn->crypto_tx); +out_crypto: +#endif return err; } @@ -101,8 +112,20 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&tipc_net(net)->crypto_tx); +#endif +} + +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); } +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -149,6 +172,10 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -156,6 +183,8 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -175,6 +204,7 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); diff --git a/net/tipc/core.h b/net/tipc/core.h index 3042f654e0af..631d83c9705f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,7 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> +#include <net/netns/hash.h> #ifdef pr_fmt #undef pr_fmt @@ -73,6 +74,9 @@ struct tipc_link; struct tipc_name_table; struct tipc_topsrv; struct tipc_monitor; +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto; +#endif #define TIPC_MOD_VER "2.0.0" @@ -134,6 +138,11 @@ struct tipc_net { /* Tracing of node internal messages */ struct packet_type loopback_pt; + +#ifdef CONFIG_TIPC_CRYPTO + /* TX crypto handler */ + struct tipc_crypto *crypto_tx; +#endif }; static inline struct tipc_net *tipc_net(struct net *net) @@ -191,6 +200,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c new file mode 100644 index 000000000000..990a872cec46 --- /dev/null +++ b/net/tipc/crypto.c @@ -0,0 +1,1986 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <crypto/aead.h> +#include <crypto/aes.h> +#include "crypto.h" + +#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */ +#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */ +#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ +#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */ +#define TIPC_MAX_TFMS_DEF 10 +#define TIPC_MAX_TFMS_LIM 1000 + +/** + * TIPC Key ids + */ +enum { + KEY_UNUSED = 0, + KEY_MIN, + KEY_1 = KEY_MIN, + KEY_2, + KEY_3, + KEY_MAX = KEY_3, +}; + +/** + * TIPC Crypto statistics + */ +enum { + STAT_OK, + STAT_NOK, + STAT_ASYNC, + STAT_ASYNC_OK, + STAT_ASYNC_NOK, + STAT_BADKEYS, /* tx only */ + STAT_BADMSGS = STAT_BADKEYS, /* rx only */ + STAT_NOKEYS, + STAT_SWITCHES, + + MAX_STATS, +}; + +/* TIPC crypto statistics' header */ +static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", + "async_nok", "badmsgs", "nokeys", + "switches"}; + +/* Max TFMs number per key */ +int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; + +/** + * struct tipc_key - TIPC keys' status indicator + * + * 7 6 5 4 3 2 1 0 + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * key: | (reserved)|passive idx| active idx|pending idx| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + */ +struct tipc_key { +#define KEY_BITS (2) +#define KEY_MASK ((1 << KEY_BITS) - 1) + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 pending:2, + active:2, + passive:2, /* rx only */ + reserved:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:2, + passive:2, /* rx only */ + active:2, + pending:2; +#else +#error "Please fix <asm/byteorder.h>" +#endif + } __packed; + u8 keys; + }; +}; + +/** + * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + */ +struct tipc_tfm { + struct crypto_aead *tfm; + struct list_head list; +}; + +/** + * struct tipc_aead - TIPC AEAD key structure + * @tfm_entry: per-cpu pointer to one entry in TFM list + * @crypto: TIPC crypto owns this key + * @cloned: reference to the source key in case cloning + * @users: the number of the key users (TX/RX) + * @salt: the key's SALT value + * @authsize: authentication tag size (max = 16) + * @mode: crypto mode is applied to the key + * @hint[]: a hint for user key + * @rcu: struct rcu_head + * @seqno: the key seqno (cluster scope) + * @refcnt: the key reference counter + */ +struct tipc_aead { +#define TIPC_AEAD_HINT_LEN (5) + struct tipc_tfm * __percpu *tfm_entry; + struct tipc_crypto *crypto; + struct tipc_aead *cloned; + atomic_t users; + u32 salt; + u8 authsize; + u8 mode; + char hint[TIPC_AEAD_HINT_LEN + 1]; + struct rcu_head rcu; + + atomic64_t seqno ____cacheline_aligned; + refcount_t refcnt ____cacheline_aligned; + +} ____cacheline_aligned; + +/** + * struct tipc_crypto_stats - TIPC Crypto statistics + */ +struct tipc_crypto_stats { + unsigned int stat[MAX_STATS]; +}; + +/** + * struct tipc_crypto - TIPC TX/RX crypto structure + * @net: struct net + * @node: TIPC node (RX) + * @aead: array of pointers to AEAD keys for encryption/decryption + * @peer_rx_active: replicated peer RX active key index + * @key: the key states + * @working: the crypto is working or not + * @stats: the crypto statistics + * @sndnxt: the per-peer sndnxt (TX) + * @timer1: general timer 1 (jiffies) + * @timer2: general timer 1 (jiffies) + * @lock: tipc_key lock + */ +struct tipc_crypto { + struct net *net; + struct tipc_node *node; + struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + atomic_t peer_rx_active; + struct tipc_key key; + u8 working:1; + struct tipc_crypto_stats __percpu *stats; + + atomic64_t sndnxt ____cacheline_aligned; + unsigned long timer1; + unsigned long timer2; + spinlock_t lock; /* crypto lock */ + +} ____cacheline_aligned; + +/* struct tipc_crypto_tx_ctx - TX context for callbacks */ +struct tipc_crypto_tx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; + struct tipc_media_addr dst; +}; + +/* struct tipc_crypto_rx_ctx - RX context for callbacks */ +struct tipc_crypto_rx_ctx { + struct tipc_aead *aead; + struct tipc_bearer *bearer; +}; + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); +static inline void tipc_aead_put(struct tipc_aead *aead); +static void tipc_aead_free(struct rcu_head *rp); +static int tipc_aead_users(struct tipc_aead __rcu *aead); +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode); +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg); +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode); +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err); +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b); +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err); +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx); +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending); +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos); +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb); +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr); +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err); +static void tipc_crypto_do_cmd(struct net *net, int cmd); +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf); +#endif + +#define key_next(cur) ((cur) % KEY_MAX + 1) + +#define tipc_aead_rcu_ptr(rcu_ptr, lock) \ + rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ + rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) + +#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ +do { \ + typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ + lockdep_is_held(lock)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + tipc_aead_put(__tmp); \ +} while (0) + +#define tipc_crypto_key_detach(rcu_ptr, lock) \ + tipc_aead_rcu_replace((rcu_ptr), NULL, lock) + +/** + * tipc_aead_key_validate - Validate a AEAD user key + */ +int tipc_aead_key_validate(struct tipc_aead_key *ukey) +{ + int keylen; + + /* Check if algorithm exists */ + if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { + pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name); + return -ENODEV; + } + + /* Currently, we only support the "gcm(aes)" cipher algorithm */ + if (strcmp(ukey->alg_name, "gcm(aes)")) + return -ENOTSUPP; + + /* Check if key size is correct */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && + keylen != TIPC_AES_GCM_KEY_SIZE_192 && + keylen != TIPC_AES_GCM_KEY_SIZE_256)) + return -EINVAL; + + return 0; +} + +static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) + tmp = NULL; + rcu_read_unlock(); + + return tmp; +} + +static inline void tipc_aead_put(struct tipc_aead *aead) +{ + if (aead && refcount_dec_and_test(&aead->refcnt)) + call_rcu(&aead->rcu, tipc_aead_free); +} + +/** + * tipc_aead_free - Release AEAD key incl. all the TFMs in the list + * @rp: rcu head pointer + */ +static void tipc_aead_free(struct rcu_head *rp) +{ + struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); + struct tipc_tfm *tfm_entry, *head, *tmp; + + if (aead->cloned) { + tipc_aead_put(aead->cloned); + } else { + head = *this_cpu_ptr(aead->tfm_entry); + list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { + crypto_free_aead(tfm_entry->tfm); + list_del(&tfm_entry->list); + kfree(tfm_entry); + } + /* Free the head */ + crypto_free_aead(head->tfm); + list_del(&head->list); + kfree(head); + } + free_percpu(aead->tfm_entry); + kfree(aead); +} + +static int tipc_aead_users(struct tipc_aead __rcu *aead) +{ + struct tipc_aead *tmp; + int users = 0; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + users = atomic_read(&tmp->users); + rcu_read_unlock(); + + return users; +} + +static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&tmp->users, 1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) +{ + struct tipc_aead *tmp; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) + atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + rcu_read_unlock(); +} + +static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) +{ + struct tipc_aead *tmp; + int cur; + + rcu_read_lock(); + tmp = rcu_dereference(aead); + if (tmp) { + do { + cur = atomic_read(&tmp->users); + if (cur == val) + break; + } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); + } + rcu_read_unlock(); +} + +/** + * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + */ +static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) +{ + struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry); + + *tfm_entry = list_next_entry(*tfm_entry, list); + return (*tfm_entry)->tfm; +} + +/** + * tipc_aead_init - Initiate TIPC AEAD + * @aead: returned new TIPC AEAD key handle pointer + * @ukey: pointer to user key data + * @mode: the key mode + * + * Allocate a (list of) new cipher transformation (TFM) with the specific user + * key data if valid. The number of the allocated TFMs can be set via the sysfs + * "net/tipc/max_tfms" first. + * Also, all the other AEAD data are also initialized. + * + * Return: 0 if the initiation is successful, otherwise: < 0 + */ +static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_tfm *tfm_entry, *head; + struct crypto_aead *tfm; + struct tipc_aead *tmp; + int keylen, err, cpu; + int tfm_cnt = 0; + + if (unlikely(*aead)) + return -EEXIST; + + /* Allocate a new AEAD */ + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (unlikely(!tmp)) + return -ENOMEM; + + /* The key consists of two parts: [AES-KEY][SALT] */ + keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; + + /* Allocate per-cpu TFM entry pointer */ + tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); + if (!tmp->tfm_entry) { + kzfree(tmp); + return -ENOMEM; + } + + /* Make a list of TFMs with the user key data */ + do { + tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + break; + } + + if (unlikely(!tfm_cnt && + crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { + crypto_free_aead(tfm); + err = -ENOTSUPP; + break; + } + + err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); + err |= crypto_aead_setkey(tfm, ukey->key, keylen); + if (unlikely(err)) { + crypto_free_aead(tfm); + break; + } + + tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); + if (unlikely(!tfm_entry)) { + crypto_free_aead(tfm); + err = -ENOMEM; + break; + } + INIT_LIST_HEAD(&tfm_entry->list); + tfm_entry->tfm = tfm; + + /* First entry? */ + if (!tfm_cnt) { + head = tfm_entry; + for_each_possible_cpu(cpu) { + *per_cpu_ptr(tmp->tfm_entry, cpu) = head; + } + } else { + list_add_tail(&tfm_entry->list, &head->list); + } + + } while (++tfm_cnt < sysctl_tipc_max_tfms); + + /* Not any TFM is allocated? */ + if (!tfm_cnt) { + free_percpu(tmp->tfm_entry); + kzfree(tmp); + return err; + } + + /* Copy some chars from the user key as a hint */ + memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN); + tmp->hint[TIPC_AEAD_HINT_LEN] = '\0'; + + /* Initialize the other data */ + tmp->mode = mode; + tmp->cloned = NULL; + tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); + atomic_set(&tmp->users, 0); + atomic64_set(&tmp->seqno, 0); + refcount_set(&tmp->refcnt, 1); + + *aead = tmp; + return 0; +} + +/** + * tipc_aead_clone - Clone a TIPC AEAD key + * @dst: dest key for the cloning + * @src: source key to clone from + * + * Make a "copy" of the source AEAD key data to the dest, the TFMs list is + * common for the keys. + * A reference to the source is hold in the "cloned" pointer for the later + * freeing purposes. + * + * Note: this must be done in cluster-key mode only! + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) +{ + struct tipc_aead *aead; + int cpu; + + if (!src) + return -ENOKEY; + + if (src->mode != CLUSTER_KEY) + return -EINVAL; + + if (unlikely(*dst)) + return -EEXIST; + + aead = kzalloc(sizeof(*aead), GFP_ATOMIC); + if (unlikely(!aead)) + return -ENOMEM; + + aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); + if (unlikely(!aead->tfm_entry)) { + kzfree(aead); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + *per_cpu_ptr(aead->tfm_entry, cpu) = + *per_cpu_ptr(src->tfm_entry, cpu); + } + + memcpy(aead->hint, src->hint, sizeof(src->hint)); + aead->mode = src->mode; + aead->salt = src->salt; + aead->authsize = src->authsize; + atomic_set(&aead->users, 0); + atomic64_set(&aead->seqno, 0); + refcount_set(&aead->refcnt, 1); + + WARN_ON(!refcount_inc_not_zero(&src->refcnt)); + aead->cloned = src; + + *dst = aead; + return 0; +} + +/** + * tipc_aead_mem_alloc - Allocate memory for AEAD request operations + * @tfm: cipher handle to be registered with the request + * @crypto_ctx_size: size of crypto context for callback + * @iv: returned pointer to IV data + * @req: returned pointer to AEAD request data + * @sg: returned pointer to SG lists + * @nsg: number of SG lists to be allocated + * + * Allocate memory to store the crypto context data, AEAD request, IV and SG + * lists, the memory layout is as follows: + * crypto_ctx || iv || aead_req || sg[] + * + * Return: the pointer to the memory areas in case of success, otherwise NULL + */ +static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, + unsigned int crypto_ctx_size, + u8 **iv, struct aead_request **req, + struct scatterlist **sg, int nsg) +{ + unsigned int iv_size, req_size; + unsigned int len; + u8 *mem; + + iv_size = crypto_aead_ivsize(tfm); + req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + + len = crypto_ctx_size; + len += iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += nsg * sizeof(**sg); + + mem = kmalloc(len, GFP_ATOMIC); + if (!mem) + return NULL; + + *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, + crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + + return (void *)mem; +} + +/** + * tipc_aead_encrypt - Encrypt a message + * @aead: TIPC AEAD key for the message encryption + * @skb: the input/output skb + * @b: TIPC bearer where the message will be delivered after the encryption + * @dst: the destination media address + * @__dnode: TIPC dest node if "known" + * + * Return: + * 0 : if the encryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the encryption has failed + */ +static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct crypto_aead *tfm = tipc_aead_tfm_next(aead); + struct tipc_crypto_tx_ctx *tx_ctx; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, len, tailen, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + /* Make sure message len at least 4-byte aligned */ + len = ALIGN(skb->len, 4); + tailen = len - skb->len + aead->authsize; + + /* Expand skb tail for authentication tag: + * As for simplicity, we'd have made sure skb having enough tailroom + * for authentication tag @skb allocation. Even when skb is nonlinear + * but there is no frag_list, it should be still fine! + * Otherwise, we must cow it to be a writable buffer with the tailroom. + */ +#ifdef TIPC_CRYPTO_DEBUG + SKB_LINEAR_ASSERT(skb); + if (tailen > skb_tailroom(skb)) { + pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n", + skb_tailroom(skb), tailen); + } +#endif + + if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) { + nsg = 1; + trailer = skb; + } else { + /* TODO: We could avoid skb_cow_data() if skb has no frag_list + * e.g. by skb_fill_page_desc() to add another page to the skb + * with the wanted tailen... However, page skbs look not often, + * so take it easy now! + * Cloned skbs e.g. from link_xmit() seems no choice though :( + */ + nsg = skb_cow_data(skb, tailen, &trailer); + if (unlikely(nsg < 0)) { + pr_err("TX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + pskb_put(skb, trailer, tailen); + + /* Allocate memory for the AEAD operation */ + ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); + goto exit; + } + + /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] + * In case we're in cluster-key mode, SALT is varied by xor-ing with + * the source address (or w0 of id), otherwise with the dest address + * if dest is known. + */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (__dnode) + salt ^= tipc_node_get_addr(__dnode); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_encrypt_done, skb); + tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; + tx_ctx->aead = aead; + tx_ctx->bearer = b; + memcpy(&tx_ctx->dst, dst, sizeof(*dst)); + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do encrypt */ + rc = crypto_aead_encrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = tx_ctx->bearer; + struct tipc_aead *aead = tx_ctx->aead; + struct tipc_crypto *tx = aead->crypto; + struct net *net = tx->net; + + switch (err) { + case 0: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); + if (likely(test_bit(0, &b->up))) + b->media->send_msg(net, skb, b, &tx_ctx->dst); + else + kfree_skb(skb); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); + kfree_skb(skb); + break; + } + + kfree(tx_ctx); + tipc_bearer_put(b); + tipc_aead_put(aead); +} + +/** + * tipc_aead_decrypt - Decrypt an encrypted message + * @net: struct net + * @aead: TIPC AEAD for the message decryption + * @skb: the input/output skb + * @b: TIPC bearer where the message has been received + * + * Return: + * 0 : if the decryption has completed + * -EINPROGRESS/-EBUSY : if a callback will be performed + * < 0 : the decryption has failed + */ +static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, + struct sk_buff *skb, struct tipc_bearer *b) +{ + struct tipc_crypto_rx_ctx *rx_ctx; + struct aead_request *req; + struct crypto_aead *tfm; + struct sk_buff *unused; + struct scatterlist *sg; + struct tipc_ehdr *ehdr; + int ehsz, nsg, rc; + void *ctx; + u32 salt; + u8 *iv; + + if (unlikely(!aead)) + return -ENOKEY; + + /* Cow skb data if needed */ + if (likely(!skb_cloned(skb) && + (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { + nsg = 1 + skb_shinfo(skb)->nr_frags; + } else { + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; + } + } + + /* Allocate memory for the AEAD operation */ + tfm = tipc_aead_tfm_next(aead); + ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); + if (unlikely(!ctx)) + return -ENOMEM; + TIPC_SKB_CB(skb)->crypto_ctx = ctx; + + /* Map skb to the sg lists */ + sg_init_table(sg, nsg); + rc = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(rc < 0)) { + pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); + goto exit; + } + + /* Reconstruct IV: */ + ehdr = (struct tipc_ehdr *)skb->data; + salt = aead->salt; + if (aead->mode == CLUSTER_KEY) + salt ^= ehdr->addr; /* __be32 */ + else if (ehdr->destined) + salt ^= tipc_own_addr(net); + memcpy(iv, &salt, 4); + memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); + + /* Prepare request */ + ehsz = tipc_ehdr_size(ehdr); + aead_request_set_tfm(req, tfm); + aead_request_set_ad(req, ehsz); + aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); + + /* Set callback function & data */ + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + tipc_aead_decrypt_done, skb); + rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; + rx_ctx->aead = aead; + rx_ctx->bearer = b; + + /* Hold bearer */ + if (unlikely(!tipc_bearer_hold(b))) { + rc = -ENODEV; + goto exit; + } + + /* Now, do decrypt */ + rc = crypto_aead_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) + return rc; + + tipc_bearer_put(b); + +exit: + kfree(ctx); + TIPC_SKB_CB(skb)->crypto_ctx = NULL; + return rc; +} + +static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; + struct tipc_bearer *b = rx_ctx->bearer; + struct tipc_aead *aead = rx_ctx->aead; + struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; + struct net *net = aead->crypto->net; + + switch (err) { + case 0: + this_cpu_inc(stats->stat[STAT_ASYNC_OK]); + break; + case -EINPROGRESS: + return; + default: + this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); + break; + } + + kfree(rx_ctx); + tipc_crypto_rcv_complete(net, aead, b, &skb, err); + if (likely(skb)) { + if (likely(test_bit(0, &b->up))) + tipc_rcv(net, skb, b); + else + kfree_skb(skb); + } + + tipc_bearer_put(b); +} + +static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) +{ + return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; +} + +/** + * tipc_ehdr_validate - Validate an encryption message + * @skb: the message buffer + * + * Returns "true" if this is a valid encryption message, otherwise "false" + */ +bool tipc_ehdr_validate(struct sk_buff *skb) +{ + struct tipc_ehdr *ehdr; + int ehsz; + + if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) + return false; + + ehdr = (struct tipc_ehdr *)skb->data; + if (unlikely(ehdr->version != TIPC_EVERSION)) + return false; + ehsz = tipc_ehdr_size(ehdr); + if (unlikely(!pskb_may_pull(skb, ehsz))) + return false; + if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) + return false; + if (unlikely(!ehdr->tx_key)) + return false; + + return true; +} + +/** + * tipc_ehdr_build - Build TIPC encryption message header + * @net: struct net + * @aead: TX AEAD key to be used for the message encryption + * @tx_key: key id used for the message encryption + * @skb: input/output message skb + * @__rx: RX crypto handle if dest is "known" + * + * Return: the header size if the building is successful, otherwise < 0 + */ +static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, + u8 tx_key, struct sk_buff *skb, + struct tipc_crypto *__rx) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_ehdr *ehdr; + u32 user = msg_user(hdr); + u64 seqno; + int ehsz; + + /* Make room for encryption header */ + ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; + WARN_ON(skb_headroom(skb) < ehsz); + ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); + + /* Obtain a seqno first: + * Use the key seqno (= cluster wise) if dest is unknown or we're in + * cluster key mode, otherwise it's better for a per-peer seqno! + */ + if (!__rx || aead->mode == CLUSTER_KEY) + seqno = atomic64_inc_return(&aead->seqno); + else + seqno = atomic64_inc_return(&__rx->sndnxt); + + /* Revoke the key if seqno is wrapped around */ + if (unlikely(!seqno)) + return tipc_crypto_key_revoke(net, tx_key); + + /* Word 1-2 */ + ehdr->seqno = cpu_to_be64(seqno); + + /* Words 0, 3- */ + ehdr->version = TIPC_EVERSION; + ehdr->user = 0; + ehdr->keepalive = 0; + ehdr->tx_key = tx_key; + ehdr->destined = (__rx) ? 1 : 0; + ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->reserved_1 = 0; + ehdr->reserved_2 = 0; + + switch (user) { + case LINK_CONFIG: + ehdr->user = LINK_CONFIG; + memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); + break; + default: + if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { + ehdr->user = LINK_PROTOCOL; + ehdr->keepalive = msg_is_keepalive(hdr); + } + ehdr->addr = hdr->hdr[3]; + break; + } + + return ehsz; +} + +static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, + u8 new_passive, + u8 new_active, + u8 new_pending) +{ +#ifdef TIPC_CRYPTO_DEBUG + struct tipc_key old = c->key; + char buf[32]; +#endif + + c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | + ((new_active & KEY_MASK) << (KEY_BITS)) | + ((new_pending & KEY_MASK)); + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("%s(%s): key changing %s ::%pS\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + tipc_key_change_dump(old, c->key, buf), + __builtin_return_address(0)); +#endif +} + +/** + * tipc_crypto_key_init - Initiate a new user / AEAD key + * @c: TIPC crypto to which new key is attached + * @ukey: the user key + * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * + * A new TIPC AEAD key will be allocated and initiated with the specified user + * key, then attached to the TIPC crypto. + * + * Return: new key id in case of success, otherwise: < 0 + */ +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode) +{ + struct tipc_aead *aead = NULL; + int rc = 0; + + /* Initiate with the new user key */ + rc = tipc_aead_init(&aead, ukey, mode); + + /* Attach it to the crypto */ + if (likely(!rc)) { + rc = tipc_crypto_key_attach(c, aead, 0); + if (rc < 0) + tipc_aead_free(&aead->rcu); + } + + pr_info("%s(%s): key initiating, rc %d!\n", + (c->node) ? "RX" : "TX", + (c->node) ? tipc_node_get_id_str(c->node) : + tipc_own_id_string(c->net), + rc); + + return rc; +} + +/** + * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto + * @c: TIPC crypto to which the new AEAD key is attached + * @aead: the new AEAD key pointer + * @pos: desired slot in the crypto key array, = 0 if any! + * + * Return: new key id in case of success, otherwise: -EBUSY + */ +static int tipc_crypto_key_attach(struct tipc_crypto *c, + struct tipc_aead *aead, u8 pos) +{ + u8 new_pending, new_passive, new_key; + struct tipc_key key; + int rc = -EBUSY; + + spin_lock_bh(&c->lock); + key = c->key; + if (key.active && key.passive) + goto exit; + if (key.passive && !tipc_aead_users(c->aead[key.passive])) + goto exit; + if (key.pending) { + if (pos) + goto exit; + if (tipc_aead_users(c->aead[key.pending]) > 0) + goto exit; + /* Replace it */ + new_pending = key.pending; + new_passive = key.passive; + new_key = new_pending; + } else { + if (pos) { + if (key.active && pos != key_next(key.active)) { + new_pending = key.pending; + new_passive = pos; + new_key = new_passive; + goto attach; + } else if (!key.active && !key.passive) { + new_pending = pos; + new_passive = key.passive; + new_key = new_pending; + goto attach; + } + } + new_pending = key_next(key.active ?: key.passive); + new_passive = key.passive; + new_key = new_pending; + } + +attach: + aead->crypto = c; + tipc_crypto_key_set_state(c, new_passive, key.active, new_pending); + tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); + + c->working = 1; + c->timer1 = jiffies; + c->timer2 = jiffies; + rc = new_key; + +exit: + spin_unlock_bh(&c->lock); + return rc; +} + +void tipc_crypto_key_flush(struct tipc_crypto *c) +{ + int k; + + spin_lock_bh(&c->lock); + c->working = 0; + tipc_crypto_key_set_state(c, 0, 0, 0); + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_crypto_key_detach(c->aead[k], &c->lock); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + spin_unlock_bh(&c->lock); +} + +/** + * tipc_crypto_key_try_align - Align RX keys if possible + * @rx: RX crypto handle + * @new_pending: new pending slot if aligned (= TX key from peer) + * + * Peer has used an unknown key slot, this only happens when peer has left and + * rejoned, or we are newcomer. + * That means, there must be no active key but a pending key at unaligned slot. + * If so, we try to move the pending key to the new slot. + * Note: A potential passive key can exist, it will be shifted correspondingly! + * + * Return: "true" if key is successfully aligned, otherwise "false" + */ +static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) +{ + struct tipc_aead *tmp1, *tmp2 = NULL; + struct tipc_key key; + bool aligned = false; + u8 new_passive = 0; + int x; + + spin_lock(&rx->lock); + key = rx->key; + if (key.pending == new_pending) { + aligned = true; + goto exit; + } + if (key.active) + goto exit; + if (!key.pending) + goto exit; + if (tipc_aead_users(rx->aead[key.pending]) > 0) + goto exit; + + /* Try to "isolate" this pending key first */ + tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); + if (!refcount_dec_if_one(&tmp1->refcnt)) + goto exit; + rcu_assign_pointer(rx->aead[key.pending], NULL); + + /* Move passive key if any */ + if (key.passive) { + tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); + x = (key.passive - key.pending + new_pending) % KEY_MAX; + new_passive = (x <= 0) ? x + KEY_MAX : x; + } + + /* Re-allocate the key(s) */ + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + rcu_assign_pointer(rx->aead[new_pending], tmp1); + if (new_passive) + rcu_assign_pointer(rx->aead[new_passive], tmp2); + refcount_set(&tmp1->refcnt, 1); + aligned = true; + pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node)); + +exit: + spin_unlock(&rx->lock); + return aligned; +} + +/** + * tipc_crypto_key_pick_tx - Pick one TX key for message decryption + * @tx: TX crypto handle + * @rx: RX crypto handle (can be NULL) + * @skb: the message skb which will be decrypted later + * + * This function looks up the existing TX keys and pick one which is suitable + * for the message decryption, that must be a cluster key and not used before + * on the same message (i.e. recursive). + * + * Return: the TX AEAD key handle in case of success, otherwise NULL + */ +static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, + struct tipc_crypto *rx, + struct sk_buff *skb) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); + struct tipc_aead *aead = NULL; + struct tipc_key key = tx->key; + u8 k, i = 0; + + /* Initialize data if not yet */ + if (!skb_cb->tx_clone_deferred) { + skb_cb->tx_clone_deferred = 1; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + } + + skb_cb->tx_clone_ctx.rx = rx; + if (++skb_cb->tx_clone_ctx.recurs > 2) + return NULL; + + /* Pick one TX key */ + spin_lock(&tx->lock); + do { + k = (i == 0) ? key.pending : + ((i == 1) ? key.active : key.passive); + if (!k) + continue; + aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); + if (!aead) + continue; + if (aead->mode != CLUSTER_KEY || + aead == skb_cb->tx_clone_ctx.last) { + aead = NULL; + continue; + } + /* Ok, found one cluster key */ + skb_cb->tx_clone_ctx.last = aead; + WARN_ON(skb->next); + skb->next = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb->next)) + pr_warn("Failed to clone skb for next round if any\n"); + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); + break; + } while (++i < 3); + spin_unlock(&tx->lock); + + return aead; +} + +/** + * tipc_crypto_key_synch: Synch own key data according to peer key status + * @rx: RX crypto handle + * @new_rx_active: latest RX active key from peer + * @hdr: TIPCv2 message + * + * This function updates the peer node related data as the peer RX active key + * has changed, so the number of TX keys' users on this node are increased and + * decreased correspondingly. + * + * The "per-peer" sndnxt is also reset when the peer key has switched. + */ +static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active, + struct tipc_msg *hdr) +{ + struct net *net = rx->net; + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + u8 cur_rx_active; + + /* TX might be even not ready yet */ + if (unlikely(!tx->key.active && !tx->key.pending)) + return; + + cur_rx_active = atomic_read(&rx->peer_rx_active); + if (likely(cur_rx_active == new_rx_active)) + return; + + /* Make sure this message destined for this node */ + if (unlikely(msg_short(hdr) || + msg_destnode(hdr) != tipc_own_addr(net))) + return; + + /* Peer RX active key has changed, try to update owns' & TX users */ + if (atomic_cmpxchg(&rx->peer_rx_active, + cur_rx_active, + new_rx_active) == cur_rx_active) { + if (new_rx_active) + tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX); + if (cur_rx_active) + tipc_aead_users_dec(tx->aead[cur_rx_active], 0); + + atomic64_set(&rx->sndnxt, 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n", + tipc_own_id_string(net), cur_rx_active, + new_rx_active, tipc_node_get_id_str(rx->node)); +#endif + } +} + +static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_key key; + + spin_lock(&tx->lock); + key = tx->key; + WARN_ON(!key.active || tx_key != key.active); + + /* Free the active key */ + tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + spin_unlock(&tx->lock); + + pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net)); + return -EKEYREVOKED; +} + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node) +{ + struct tipc_crypto *c; + + if (*crypto) + return -EEXIST; + + /* Allocate crypto */ + c = kzalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return -ENOMEM; + + /* Allocate statistic structure */ + c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); + if (!c->stats) { + kzfree(c); + return -ENOMEM; + } + + c->working = 0; + c->net = net; + c->node = node; + tipc_crypto_key_set_state(c, 0, 0, 0); + atomic_set(&c->peer_rx_active, 0); + atomic64_set(&c->sndnxt, 0); + c->timer1 = jiffies; + c->timer2 = jiffies; + spin_lock_init(&c->lock); + *crypto = c; + + return 0; +} + +void tipc_crypto_stop(struct tipc_crypto **crypto) +{ + struct tipc_crypto *c, *tx, *rx; + bool is_rx; + u8 k; + + if (!*crypto) + return; + + rcu_read_lock(); + /* RX stopping? => decrease TX key users if any */ + is_rx = !!((*crypto)->node); + if (is_rx) { + rx = *crypto; + tx = tipc_net(rx->net)->crypto_tx; + k = atomic_read(&rx->peer_rx_active); + if (k) { + tipc_aead_users_dec(tx->aead[k], 0); + /* Mark the point TX key users changed */ + tx->timer1 = jiffies; + } + } + + /* Release AEAD keys */ + c = *crypto; + for (k = KEY_MIN; k <= KEY_MAX; k++) + tipc_aead_put(rcu_dereference(c->aead[k])); + rcu_read_unlock(); + + pr_warn("%s(%s) has been purged, node left!\n", + (is_rx) ? "RX" : "TX", + (is_rx) ? tipc_node_get_id_str((*crypto)->node) : + tipc_own_id_string((*crypto)->net)); + + /* Free this crypto statistics */ + free_percpu(c->stats); + + *crypto = NULL; + kzfree(c); +} + +void tipc_crypto_timeout(struct tipc_crypto *rx) +{ + struct tipc_net *tn = tipc_net(rx->net); + struct tipc_crypto *tx = tn->crypto_tx; + struct tipc_key key; + u8 new_pending, new_passive; + int cmd; + + /* TX key activating: + * The pending key (users > 0) -> active + * The active key if any (users == 0) -> free + */ + spin_lock(&tx->lock); + key = tx->key; + if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) + goto s1; + if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) + goto s1; + if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM)) + goto s1; + + tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); + if (key.active) + tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); + this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); + pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net), + key.pending); + +s1: + spin_unlock(&tx->lock); + + /* RX key activating: + * The pending key (users > 0) -> active + * The active key if any -> passive, freed later + */ + spin_lock(&rx->lock); + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) + goto s2; + + new_pending = (key.passive && + !tipc_aead_users(rx->aead[key.passive])) ? + key.passive : 0; + new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive); + tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending); + this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); + pr_info("RX(%s): key %d is activated!\n", + tipc_node_get_id_str(rx->node), key.pending); + goto s5; + +s2: + /* RX key "faulty" switching: + * The faulty pending key (users < -30) -> passive + * The passive key (users = 0) -> pending + * Note: This only happens after RX deactivated - s3! + */ + key = rx->key; + if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30) + goto s3; + if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0) + goto s3; + + new_pending = key.passive; + new_passive = key.pending; + tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending); + goto s5; + +s3: + /* RX key deactivating: + * The passive key if any -> pending + * The active key -> passive (users = 0) / pending + * The pending key if any -> passive (users = 0) + */ + key = rx->key; + if (!key.active) + goto s4; + if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM)) + goto s4; + + new_pending = (key.passive) ?: key.active; + new_passive = (key.passive) ? key.active : key.pending; + tipc_aead_users_set(rx->aead[new_pending], 0); + if (new_passive) + tipc_aead_users_set(rx->aead[new_passive], 0); + tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); + pr_info("RX(%s): key %d is deactivated!\n", + tipc_node_get_id_str(rx->node), key.active); + goto s5; + +s4: + /* RX key passive -> freed: */ + key = rx->key; + if (!key.passive || !tipc_aead_users(rx->aead[key.passive])) + goto s5; + if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM)) + goto s5; + + tipc_crypto_key_set_state(rx, 0, key.active, key.pending); + tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); + pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node), + key.passive); + +s5: + spin_unlock(&rx->lock); + + /* Limit max_tfms & do debug commands if needed */ + if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) + return; + + cmd = sysctl_tipc_max_tfms; + sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; + tipc_crypto_do_cmd(rx->net, cmd); +} + +/** + * tipc_crypto_xmit - Build & encrypt TIPC message for xmit + * @net: struct net + * @skb: input/output message skb pointer + * @b: bearer used for xmit later + * @dst: destination media address + * @__dnode: destination node for reference if any + * + * First, build an encryption message header on the top of the message, then + * encrypt the original TIPC message by using the active or pending TX key. + * If the encryption is successful, the encrypted skb is returned directly or + * via the callback. + * Otherwise, the skb is freed! + * + * Return: + * 0 : the encryption has succeeded (or no encryption) + * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * -ENOKEK : the encryption has failed due to no key + * -EKEYREVOKED : the encryption has failed due to key revoked + * -ENOMEM : the encryption has failed due to no memory + * < 0 : the encryption has failed due to other reasons + */ +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode) +{ + struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats = tx->stats; + struct tipc_key key = tx->key; + struct tipc_aead *aead = NULL; + struct sk_buff *probe; + int rc = -ENOKEY; + u8 tx_key; + + /* No encryption? */ + if (!tx->working) + return 0; + + /* Try with the pending key if available and: + * 1) This is the only choice (i.e. no active key) or; + * 2) Peer has switched to this key (unicast only) or; + * 3) It is time to do a pending key probe; + */ + if (unlikely(key.pending)) { + tx_key = key.pending; + if (!key.active) + goto encrypt; + if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) + goto encrypt; + if (TIPC_SKB_CB(*skb)->probe) + goto encrypt; + if (!__rx && + time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) { + tx->timer2 = jiffies; + probe = skb_clone(*skb, GFP_ATOMIC); + if (probe) { + TIPC_SKB_CB(probe)->probe = 1; + tipc_crypto_xmit(net, &probe, b, dst, __dnode); + if (probe) + b->media->send_msg(net, probe, b, dst); + } + } + } + /* Else, use the active key if any */ + if (likely(key.active)) { + tx_key = key.active; + goto encrypt; + } + goto exit; + +encrypt: + aead = tipc_aead_get(tx->aead[tx_key]); + if (unlikely(!aead)) + goto exit; + rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); + if (likely(rc > 0)) + rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); + +exit: + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) + this_cpu_inc(stats->stat[STAT_NOKEYS]); + else if (rc == -EKEYREVOKED) + this_cpu_inc(stats->stat[STAT_BADKEYS]); + kfree_skb(*skb); + *skb = NULL; + break; + } + + tipc_aead_put(aead); + return rc; +} + +/** + * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer + * @net: struct net + * @rx: RX crypto handle + * @skb: input/output message skb pointer + * @b: bearer where the message has been received + * + * If the decryption is successful, the decrypted skb is returned directly or + * as the callback, the encryption header and auth tag will be trimed out + * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). + * Otherwise, the skb will be freed! + * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX + * cluster key(s) can be taken for decryption (- recursive). + * + * Return: + * 0 : the decryption has successfully completed + * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * -ENOKEY : the decryption has failed due to no key + * -EBADMSG : the decryption has failed due to bad message + * -ENOMEM : the decryption has failed due to no memory + * < 0 : the decryption has failed due to other reasons + */ +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b) +{ + struct tipc_crypto *tx = tipc_net(net)->crypto_tx; + struct tipc_crypto_stats __percpu *stats; + struct tipc_aead *aead = NULL; + struct tipc_key key; + int rc = -ENOKEY; + u8 tx_key = 0; + + /* New peer? + * Let's try with TX key (i.e. cluster mode) & verify the skb first! + */ + if (unlikely(!rx)) + goto pick_tx; + + /* Pick RX key according to TX key, three cases are possible: + * 1) The current active key (likely) or; + * 2) The pending (new or deactivated) key (if any) or; + * 3) The passive or old active key (i.e. users > 0); + */ + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; + key = rx->key; + if (likely(tx_key == key.active)) + goto decrypt; + if (tx_key == key.pending) + goto decrypt; + if (tx_key == key.passive) { + rx->timer2 = jiffies; + if (tipc_aead_users(rx->aead[key.passive]) > 0) + goto decrypt; + } + + /* Unknown key, let's try to align RX key(s) */ + if (tipc_crypto_key_try_align(rx, tx_key)) + goto decrypt; + +pick_tx: + /* No key suitable? Try to pick one from TX... */ + aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + if (aead) + goto decrypt; + goto exit; + +decrypt: + rcu_read_lock(); + if (!aead) + aead = tipc_aead_get(rx->aead[tx_key]); + rc = tipc_aead_decrypt(net, aead, *skb, b); + rcu_read_unlock(); + +exit: + stats = ((rx) ?: tx)->stats; + switch (rc) { + case 0: + this_cpu_inc(stats->stat[STAT_OK]); + break; + case -EINPROGRESS: + case -EBUSY: + this_cpu_inc(stats->stat[STAT_ASYNC]); + *skb = NULL; + return rc; + default: + this_cpu_inc(stats->stat[STAT_NOK]); + if (rc == -ENOKEY) { + kfree_skb(*skb); + *skb = NULL; + if (rx) + tipc_node_put(rx->node); + this_cpu_inc(stats->stat[STAT_NOKEYS]); + return rc; + } else if (rc == -EBADMSG) { + this_cpu_inc(stats->stat[STAT_BADMSGS]); + } + break; + } + + tipc_crypto_rcv_complete(net, aead, b, skb, rc); + return rc; +} + +static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, + struct tipc_bearer *b, + struct sk_buff **skb, int err) +{ + struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); + struct tipc_crypto *rx = aead->crypto; + struct tipc_aead *tmp = NULL; + struct tipc_ehdr *ehdr; + struct tipc_node *n; + u8 rx_key_active; + bool destined; + + /* Is this completed by TX? */ + if (unlikely(!rx->node)) { + rx = skb_cb->tx_clone_ctx.rx; +#ifdef TIPC_CRYPTO_DEBUG + pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", + (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, + (*skb)->next, skb_cb->flags); + pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", + skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, + aead->crypto->aead[1], aead->crypto->aead[2], + aead->crypto->aead[3]); +#endif + if (unlikely(err)) { + if (err == -EBADMSG && (*skb)->next) + tipc_rcv(net, (*skb)->next, b); + goto free_skb; + } + + if (likely((*skb)->next)) { + kfree_skb((*skb)->next); + (*skb)->next = NULL; + } + ehdr = (struct tipc_ehdr *)(*skb)->data; + if (!rx) { + WARN_ON(ehdr->user != LINK_CONFIG); + n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, + true); + rx = tipc_node_crypto_rx(n); + if (unlikely(!rx)) + goto free_skb; + } + + /* Skip cloning this time as we had a RX pending key */ + if (rx->key.pending) + goto rcv; + if (tipc_aead_clone(&tmp, aead) < 0) + goto rcv; + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + tipc_aead_free(&tmp->rcu); + goto rcv; + } + tipc_aead_put(aead); + aead = tipc_aead_get(tmp); + } + + if (unlikely(err)) { + tipc_aead_users_dec(aead, INT_MIN); + goto free_skb; + } + + /* Set the RX key's user */ + tipc_aead_users_set(aead, 1); + +rcv: + /* Mark this point, RX works */ + rx->timer1 = jiffies; + + /* Remove ehdr & auth. tag prior to tipc_rcv() */ + ehdr = (struct tipc_ehdr *)(*skb)->data; + destined = ehdr->destined; + rx_key_active = ehdr->rx_key_active; + skb_pull(*skb, tipc_ehdr_size(ehdr)); + pskb_trim(*skb, (*skb)->len - aead->authsize); + + /* Validate TIPCv2 message */ + if (unlikely(!tipc_msg_validate(skb))) { + pr_err_ratelimited("Packet dropped after decryption!\n"); + goto free_skb; + } + + /* Update peer RX active key & TX users */ + if (destined) + tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb)); + + /* Mark skb decrypted */ + skb_cb->decrypted = 1; + + /* Clear clone cxt if any */ + if (likely(!skb_cb->tx_clone_deferred)) + goto exit; + skb_cb->tx_clone_deferred = 0; + memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); + goto exit; + +free_skb: + kfree_skb(*skb); + *skb = NULL; + +exit: + tipc_aead_put(aead); + if (rx) + tipc_node_put(rx->node); +} + +static void tipc_crypto_do_cmd(struct net *net, int cmd) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_crypto *tx = tn->crypto_tx, *rx; + struct list_head *p; + unsigned int stat; + int i, j, cpu; + char buf[200]; + + /* Currently only one command is supported */ + switch (cmd) { + case 0xfff1: + goto print_stats; + default: + return; + } + +print_stats: + /* Print a header */ + pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); + + /* Print key status */ + pr_info("Key status:\n"); + pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), + tipc_crypto_key_dump(tx, buf)); + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), + tipc_crypto_key_dump(rx, buf)); + } + rcu_read_unlock(); + + /* Print crypto statistics */ + for (i = 0, j = 0; i < MAX_STATS; i++) + j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); + pr_info("\nCounter %s", buf); + + memset(buf, '-', 115); + buf[115] = '\0'; + pr_info("%s\n", buf); + + j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + + rcu_read_lock(); + for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { + rx = tipc_node_crypto_rx_by_list(p); + j = scnprintf(buf, 200, "RX(%7.7s) ", + tipc_node_get_id_str(rx->node)); + for_each_possible_cpu(cpu) { + for (i = 0; i < MAX_STATS; i++) { + stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; + j += scnprintf(buf + j, 200 - j, "|%11d ", + stat); + } + pr_info("%s", buf); + j = scnprintf(buf, 200, "%12s", " "); + } + } + rcu_read_unlock(); + + pr_info("\n======================== Done ========================\n"); +} + +static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) +{ + struct tipc_key key = c->key; + struct tipc_aead *aead; + int k, i = 0; + char *s; + + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); + + rcu_read_lock(); + aead = rcu_dereference(c->aead[k]); + if (aead) + i += scnprintf(buf + i, 200 - i, + "{\"%s...\", \"%s\"}/%d:%d", + aead->hint, + (aead->mode == CLUSTER_KEY) ? "c" : "p", + atomic_read(&aead->users), + refcount_read(&aead->refcnt)); + rcu_read_unlock(); + i += scnprintf(buf + i, 200 - i, "\n"); + } + + if (c->node) + i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", + atomic_read(&c->peer_rx_active)); + + return buf; +} + +#ifdef TIPC_CRYPTO_DEBUG +static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, + char *buf) +{ + struct tipc_key *key = &old; + int k, i = 0; + char *s; + + /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ +again: + i += scnprintf(buf + i, 32 - i, "["); + for (k = KEY_MIN; k <= KEY_MAX; k++) { + if (k == key->passive) + s = "pas"; + else if (k == key->active) + s = "act"; + else if (k == key->pending) + s = "pen"; + else + s = "-"; + i += scnprintf(buf + i, 32 - i, + (k != KEY_MAX) ? "%s " : "%s", s); + } + if (key != &new) { + i += scnprintf(buf + i, 32 - i, "] -> "); + key = &new; + goto again; + } + i += scnprintf(buf + i, 32 - i, "]"); + return buf; +} +#endif diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h new file mode 100644 index 000000000000..c3de769f49e8 --- /dev/null +++ b/net/tipc/crypto.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * net/tipc/crypto.h: Include file for TIPC crypto + * + * Copyright (c) 2019, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifdef CONFIG_TIPC_CRYPTO +#ifndef _TIPC_CRYPTO_H +#define _TIPC_CRYPTO_H + +#include "core.h" +#include "node.h" +#include "msg.h" +#include "bearer.h" + +#define TIPC_EVERSION 7 + +/* AEAD aes(gcm) */ +#define TIPC_AES_GCM_KEY_SIZE_128 16 +#define TIPC_AES_GCM_KEY_SIZE_192 24 +#define TIPC_AES_GCM_KEY_SIZE_256 32 + +#define TIPC_AES_GCM_SALT_SIZE 4 +#define TIPC_AES_GCM_IV_SIZE 12 +#define TIPC_AES_GCM_TAG_SIZE 16 + +/** + * TIPC crypto modes: + * - CLUSTER_KEY: + * One single key is used for both TX & RX in all nodes in the cluster. + * - PER_NODE_KEY: + * Each nodes in the cluster has one TX key, for RX a node needs to know + * its peers' TX key for the decryption of messages from those nodes. + */ +enum { + CLUSTER_KEY = 1, + PER_NODE_KEY = (1 << 1), +}; + +extern int sysctl_tipc_max_tfms __read_mostly; + +/** + * TIPC encryption message format: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 + * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w1:| Seqno | + * w2:| (8 octets) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * w3:\ Prevnode \ + * / (4 or 16 octets) / + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Encrypted complete TIPC V2 header and user data / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * | AuthTag | + * | (16 octets) | + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Word0: + * Ver : = 7 i.e. TIPC encryption message version + * User : = 7 (for LINK_PROTOCOL); = 13 (for LINK_CONFIG) or = 0 + * D : The destined bit i.e. the message's destination node is + * "known" or not at the message encryption + * TX : TX key used for the message encryption + * RX : Currently RX active key corresponding to the destination + * node's TX key (when the "D" bit is set) + * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * Rsvd : Reserved bit, field + * Word1-2: + * Seqno : The 64-bit sequence number of the encrypted message, also + * part of the nonce used for the message encryption/decryption + * Word3-: + * Prevnode: The source node address, or ID in case LINK_CONFIG only + * AuthTag : The authentication tag for the message integrity checking + * generated by the message encryption + */ +struct tipc_ehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 destined:1, + user:4, + version:3; + __u8 reserved_1:3, + keepalive:1, + rx_key_active:2, + tx_key:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:3, + user:4, + destined:1; + __u8 tx_key:2, + rx_key_active:2, + keepalive:1, + reserved_1:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __be16 reserved_2; + } __packed; + __be32 w0; + }; + __be64 seqno; + union { + __be32 addr; + __u8 id[NODE_ID_LEN]; /* For a LINK_CONFIG message only! */ + }; +#define EHDR_SIZE (offsetof(struct tipc_ehdr, addr) + sizeof(__be32)) +#define EHDR_CFG_SIZE (sizeof(struct tipc_ehdr)) +#define EHDR_MIN_SIZE (EHDR_SIZE) +#define EHDR_MAX_SIZE (EHDR_CFG_SIZE) +#define EMSG_OVERHEAD (EHDR_SIZE + TIPC_AES_GCM_TAG_SIZE) +} __packed; + +int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, + struct tipc_node *node); +void tipc_crypto_stop(struct tipc_crypto **crypto); +void tipc_crypto_timeout(struct tipc_crypto *rx); +int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, + struct tipc_bearer *b, struct tipc_media_addr *dst, + struct tipc_node *__dnode); +int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, + struct sk_buff **skb, struct tipc_bearer *b); +int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, + u8 mode); +void tipc_crypto_key_flush(struct tipc_crypto *c); +int tipc_aead_key_validate(struct tipc_aead_key *ukey); +bool tipc_ehdr_validate(struct sk_buff *skb); + +#endif /* _TIPC_CRYPTO_H */ +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..b043e8c6397a 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -242,7 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, - &maddr, &respond, &dupl_addr); + msg_peer_net_hash(hdr), &maddr, &respond, + &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) diff --git a/net/tipc/link.c b/net/tipc/link.c index 999eab592de8..24d4d10756d3 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -44,6 +44,7 @@ #include "netlink.h" #include "monitor.h" #include "trace.h" +#include "crypto.h" #include <linux/pkt_sched.h> @@ -397,6 +398,15 @@ int tipc_link_mtu(struct tipc_link *l) return l->mtu; } +int tipc_link_mss(struct tipc_link *l) +{ +#ifdef CONFIG_TIPC_CRYPTO + return l->mtu - INT_H_SIZE - EMSG_OVERHEAD; +#else + return l->mtu - INT_H_SIZE; +#endif +} + u16 tipc_link_rcv_nxt(struct tipc_link *l) { return l->rcv_nxt; @@ -540,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, /* Disable replicast if even a single peer doesn't support it */ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) - tipc_bcast_disable_rcast(net); + tipc_bcast_toggle_rcast(net, false); return true; } @@ -940,16 +950,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); - unsigned int maxwin = l->window; - int imp = msg_importance(hdr); - unsigned int mtu = l->mtu; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff *skb, *_skb; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - struct sk_buff_head *transmq = &l->transmq; - struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, **tskb; int pkt_cnt = skb_queue_len(list); + int imp = msg_importance(hdr); + unsigned int mss = tipc_link_mss(l); + unsigned int maxwin = l->window; + unsigned int mtu = l->mtu; + bool new_bundle; int rc = 0; if (unlikely(msg_size(hdr) > mtu)) { @@ -975,20 +987,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, } /* Prepare each packet for sending, and add to relevant queue: */ - while (skb_queue_len(list)) { - skb = skb_peek(list); - hdr = buf_msg(skb); - msg_set_seqno(hdr, seqno); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - + while ((skb = __skb_dequeue(list))) { if (likely(skb_queue_len(transmq) < maxwin)) { + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { + kfree_skb(skb); __skb_queue_purge(list); return -ENOBUFS; } - __skb_dequeue(list); __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) @@ -1000,22 +1010,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - tskb = &l->backlog[imp].target_bskb; - if (tipc_msg_bundle(*tskb, hdr, mtu)) { - kfree_skb(__skb_dequeue(list)); - l->stats.sent_bundled++; - continue; - } - if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) { - kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, *tskb); - l->backlog[imp].len++; - l->stats.sent_bundled++; - l->stats.sent_bundles++; + if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb, + mss, l->addr, &new_bundle)) { + if (skb) { + /* Keep a ref. to the skb for next try */ + l->backlog[imp].target_bskb = skb; + l->backlog[imp].len++; + __skb_queue_tail(backlogq, skb); + } else { + if (new_bundle) { + l->stats.sent_bundles++; + l->stats.sent_bundled++; + } + l->stats.sent_bundled++; + } continue; } l->backlog[imp].target_bskb = NULL; - l->backlog[imp].len += skb_queue_len(list); + l->backlog[imp].len += (1 + skb_queue_len(list)); + __skb_queue_tail(backlogq, skb); skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; @@ -1084,7 +1097,7 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, return false; if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + - msecs_to_jiffies(r->tolerance))) + msecs_to_jiffies(r->tolerance * 10))) return false; hdr = buf_msg(skb); @@ -1151,7 +1164,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1427,8 +1440,7 @@ next_gap_ack: if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; - _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, - GFP_ATOMIC); + _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); @@ -1728,21 +1740,6 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; __skb_queue_head_init(&tnlq); - __skb_queue_head_init(&tmpxq); - __skb_queue_head_init(&frags); - - /* At least one packet required for safe algorithm => add dummy */ - skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), - 0, 0, TIPC_ERR_NO_PORT); - if (!skb) { - pr_warn("%sunable to create tunnel packet\n", link_co_err); - return; - } - __skb_queue_tail(&tnlq, skb); - tipc_link_xmit(l, &tnlq, &tmpxq); - __skb_queue_purge(&tmpxq); - /* Link Synching: * From now on, send only one single ("dummy") SYNCH message * to peer. The SYNCH message does not contain any data, just @@ -1768,6 +1765,20 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, return; } + __skb_queue_head_init(&tmpxq); + __skb_queue_head_init(&frags); + /* At least one packet required for safe algorithm => add dummy */ + skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), + 0, 0, TIPC_ERR_NO_PORT); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, &tmpxq); + __skb_queue_purge(&tmpxq); + /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); @@ -1873,7 +1884,7 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, tipc_link_create_dummy_tnl_msg(tnl, xmitq); - /* This failover link enpoint was never established before, + /* This failover link endpoint was never established before, * so it has not received anything from peer. * Otherwise, it must be a normal failover situation or the * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes diff --git a/net/tipc/link.h b/net/tipc/link.h index adcad65e761c..c09e9d49d0a3 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -141,6 +141,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); +int tipc_link_mss(struct tipc_link *l); void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 6a6eae88442f..58708b4c7719 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -665,6 +665,21 @@ void tipc_mon_delete(struct net *net, int bearer_id) kfree(mon); } +void tipc_mon_reinit_self(struct net *net) +{ + struct tipc_monitor *mon; + int bearer_id; + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + mon = tipc_monitor(net, bearer_id); + if (!mon) + continue; + write_lock_bh(&mon->lock); + mon->self->addr = tipc_own_addr(net); + write_unlock_bh(&mon->lock); + } +} + int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index 2a21b93e0d04..ed63d2e650b0 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -77,6 +77,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id); int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node); +void tipc_mon_reinit_self(struct net *net); extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 922d262e153f..0d515d20b056 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -39,10 +39,16 @@ #include "msg.h" #include "addr.h" #include "name_table.h" +#include "crypto.h" #define MAX_FORWARD_SIZE 1024 +#ifdef CONFIG_TIPC_CRYPTO +#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) +#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_TAILROOM 16 +#endif static unsigned int align(unsigned int i) { @@ -61,7 +67,11 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; +#ifdef CONFIG_TIPC_CRYPTO + unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; +#else unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; +#endif skb = alloc_skb_fclone(buf_size, gfp); if (skb) { @@ -173,7 +183,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { - TIPC_SKB_CB(head)->validated = false; + TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; @@ -190,6 +200,59 @@ err: return 0; } +/** + * tipc_msg_append(): Append data to tail of an existing buffer queue + * @hdr: header to be used + * @m: the data to be appended + * @mss: max allowable size of buffer + * @dlen: size of data to be appended + * @txq: queue to appand to + * Returns the number og 1k blocks appended or errno value + */ +int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq) +{ + struct sk_buff *skb, *prev; + int accounted, total, curr; + int mlen, cpy, rem = dlen; + struct tipc_msg *hdr; + + skb = skb_peek_tail(txq); + accounted = skb ? msg_blocks(buf_msg(skb)) : 0; + total = accounted; + + while (rem) { + if (!skb || skb->len >= mss) { + prev = skb; + skb = tipc_buf_acquire(mss, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + skb_orphan(skb); + skb_trim(skb, MIN_H_SIZE); + hdr = buf_msg(skb); + skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); + msg_set_hdr_sz(hdr, MIN_H_SIZE); + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; + if (prev) + msg_set_ack_required(buf_msg(prev), 0); + msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); + cpy = min_t(int, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; + } + return total - accounted; +} + /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least @@ -218,6 +281,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; @@ -239,7 +303,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) if (unlikely(skb->len < msz)) return false; - TIPC_SKB_CB(skb)->validated = true; + TIPC_SKB_CB(skb)->validated = 1; return true; } @@ -419,48 +483,98 @@ error: } /** - * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @skb: the buffer to append to ("bundle") - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer - * Consumes buffer if successful - * Returns true if bundling could be performed, otherwise false + * tipc_msg_bundle - Append contents of a buffer to tail of an existing one + * @bskb: the bundle buffer to append to + * @msg: message to be appended + * @max: max allowable size for the bundle buffer + * + * Returns "true" if bundling has been performed, otherwise "false" */ -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) +static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, + u32 max) { - struct tipc_msg *bmsg; - unsigned int bsz; - unsigned int msz = msg_size(msg); - u32 start, pad; - u32 max = mtu - INT_H_SIZE; + struct tipc_msg *bmsg = buf_msg(bskb); + u32 msz, bsz, offset, pad; - if (likely(msg_user(msg) == MSG_FRAGMENTER)) - return false; - if (!skb) - return false; - bmsg = buf_msg(skb); + msz = msg_size(msg); bsz = msg_size(bmsg); - start = align(bsz); - pad = start - bsz; + offset = align(bsz); + pad = offset - bsz; - if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) + if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; - if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + if (unlikely(max < (offset + msz))) return false; - if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) + + skb_put(bskb, pad + msz); + skb_copy_to_linear_data_offset(bskb, offset, msg, msz); + msg_set_size(bmsg, offset + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + return true; +} + +/** + * tipc_msg_try_bundle - Try to bundle a new message to the last one + * @tskb: the last/target message to which the new one will be appended + * @skb: the new message skb pointer + * @mss: max message size (header inclusive) + * @dnode: destination node for the message + * @new_bundle: if this call made a new bundle or not + * + * Return: "true" if the new message skb is potential for bundling this time or + * later, in the case a bundling has been done this time, the skb is consumed + * (the skb pointer = NULL). + * Otherwise, "false" if the skb cannot be bundled at all. + */ +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle) +{ + struct tipc_msg *msg, *inner, *outer; + u32 tsz; + + /* First, check if the new buffer is suitable for bundling */ + msg = buf_msg(*skb); + if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (unlikely(skb_tailroom(skb) < (pad + msz))) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; - if (unlikely(max < (start + msz))) + if (msg_user(msg) == BCAST_PROTOCOL) return false; - if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && - (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + if (mss <= INT_H_SIZE + msg_size(msg)) return false; - skb_put(skb, pad + msz); - skb_copy_to_linear_data_offset(skb, start, msg, msz); - msg_set_size(bmsg, start + msz); - msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + /* Ok, but the last/target buffer can be empty? */ + if (unlikely(!tskb)) + return true; + + /* Is it a bundle already? Try to bundle the new message to it */ + if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { + *new_bundle = false; + goto bundle; + } + + /* Make a new bundle of the two messages if possible */ + tsz = msg_size(buf_msg(tskb)); + if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + return true; + if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, + GFP_ATOMIC))) + return true; + inner = buf_msg(tskb); + skb_push(tskb, INT_H_SIZE); + outer = buf_msg(tskb); + tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, + dnode); + msg_set_importance(outer, msg_importance(inner)); + msg_set_size(outer, INT_H_SIZE + tsz); + msg_set_msgcnt(outer, 1); + *new_bundle = true; + +bundle: + if (likely(tipc_msg_bundle(tskb, msg, mss))) { + consume_skb(*skb); + *skb = NULL; + } return true; } @@ -510,49 +624,6 @@ none: } /** - * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain, where head is the buffer to replace/append - * @skb: buffer to be created, appended to and returned in case of success - * @msg: message to be appended - * @mtu: max allowable size for the bundle buffer, inclusive header - * @dnode: destination node for message. (Not always present in header) - * Returns true if success, otherwise false - */ -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode) -{ - struct sk_buff *_skb; - struct tipc_msg *bmsg; - u32 msz = msg_size(msg); - u32 max = mtu - INT_H_SIZE; - - if (msg_user(msg) == MSG_FRAGMENTER) - return false; - if (msg_user(msg) == TUNNEL_PROTOCOL) - return false; - if (msg_user(msg) == BCAST_PROTOCOL) - return false; - if (msz > (max / 2)) - return false; - - _skb = tipc_buf_acquire(max, GFP_ATOMIC); - if (!_skb) - return false; - - skb_trim(_skb, INT_H_SIZE); - bmsg = buf_msg(_skb); - tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, - INT_H_SIZE, dnode); - msg_set_importance(bmsg, msg_importance(msg)); - msg_set_seqno(bmsg, msg_seqno(msg)); - msg_set_ack(bmsg, msg_ack(msg)); - msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(_skb, msg, mtu); - *skb = _skb; - return true; -} - -/** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0daa6f04ca81..6d466ebdb64f 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -102,16 +102,42 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { - struct sk_buff *tail; - unsigned long nxt_retr; - unsigned long retr_stamp; - u32 bytes_read; - u32 orig_member; - u16 chain_imp; - u16 ackers; - u16 retr_cnt; - bool validated; -}; + union { + struct { + struct sk_buff *tail; + unsigned long nxt_retr; + unsigned long retr_stamp; + u32 bytes_read; + u32 orig_member; + u16 chain_imp; + u16 ackers; + u16 retr_cnt; + } __packed; +#ifdef CONFIG_TIPC_CRYPTO + struct { + struct tipc_crypto *rx; + struct tipc_aead *last; + u8 recurs; + } tx_clone_ctx __packed; +#endif + } __packed; + union { + struct { + u8 validated:1; +#ifdef CONFIG_TIPC_CRYPTO + u8 encrypted:1; + u8 decrypted:1; + u8 probe:1; + u8 tx_clone_deferred:1; +#endif + }; + u8 flags; + }; + u8 reserved; +#ifdef CONFIG_TIPC_CRYPTO + void *crypto_ctx; +#endif +} __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) @@ -290,6 +316,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline int msg_ack_required(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); @@ -1026,6 +1062,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); @@ -1057,14 +1107,15 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, - u32 mtu, u32 dnode); +bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, + u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); +int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, + int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 836e629e8f4a..5feaf3b67380 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..92d04dc2a44b 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,7 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -66,6 +67,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +76,7 @@ struct service_range { */ struct tipc_service { u32 type; + u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -244,6 +248,8 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */ + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -278,6 +284,20 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, } /** + * Code reused: time_after32() for the same purpose + */ +#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + +/** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range @@ -286,36 +306,51 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; + INIT_LIST_HEAD(&publ_list); for (n = rb_first(&service->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower > ns.upper) break; if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) continue; - first = true; + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..728bc7016c38 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + u32 id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct list_head list; struct rcu_head rcu; }; diff --git a/net/tipc/net.c b/net/tipc/net.c index 85707c185360..2de3cec9929d 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -42,6 +42,7 @@ #include "node.h" #include "bcast.h" #include "netlink.h" +#include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -136,6 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); + tipc_mon_reinit_self(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index d6165ad384c0..e53231bd23b4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -102,7 +102,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_ID] = { .type = NLA_BINARY, + .len = TIPC_NODEID_LEN}, + [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, + .len = TIPC_AEAD_KEY_SIZE_MAX}, }; /* Properties valid for media, bearer and link */ @@ -176,7 +180,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_PUBL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_publ_dump, }, { @@ -239,7 +244,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_MON_PEER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_nl_node_dump_monitor_peer, }, { @@ -250,10 +256,23 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #ifdef CONFIG_TIPC_MEDIA_UDP { .cmd = TIPC_NL_UDP_GET_REMOTEIP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = tipc_udp_nl_dump_remoteip, }, #endif +#ifdef CONFIG_TIPC_CRYPTO + { + .cmd = TIPC_NL_KEY_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_set_key, + }, + { + .cmd = TIPC_NL_KEY_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_node_flush_key, + }, +#endif }; struct genl_family tipc_genl_family __ro_after_init = { @@ -268,18 +287,6 @@ struct genl_family tipc_genl_family __ro_after_init = { .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), }; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) -{ - u32 maxattr = tipc_genl_family.maxattr; - - *attr = genl_family_attrbuf(&tipc_genl_family); - if (!*attr) - return -EOPNOTSUPP; - - return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr, - tipc_nl_policy, NULL); -} - int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 4ba0ad422110..7cf777723e3e 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -38,7 +38,6 @@ #include <net/netlink.h> extern struct genl_family tipc_genl_family; -int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); struct tipc_nl_msg { struct sk_buff *skb; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e135d4e11231..17a529739f8d 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -181,15 +181,18 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) { + struct genl_dumpit_info info; int len = 0; int err; struct sk_buff *buf; struct nlmsghdr *nlmsg; struct netlink_callback cb; + struct nlattr **attrbuf; memset(&cb, 0, sizeof(cb)); cb.nlh = (struct nlmsghdr *)arg->data; cb.skb = arg; + cb.data = &info; buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!buf) @@ -201,19 +204,35 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) { + err = -ENOMEM; + goto err_out; + } + + info.attrs = attrbuf; + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + do { int rem; len = (*cmd->dumpit)(buf, &cb); nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) { - struct nlattr **attrs; - - err = tipc_nlmsg_parse(nlmsg, &attrs); + err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN, + attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, + NULL); if (err) goto err_out; - err = (*cmd->format)(msg, attrs); + err = (*cmd->format)(msg, attrbuf); if (err) goto err_out; @@ -231,6 +250,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, err = 0; err_out: + kfree(attrbuf); tipc_dump_done(&cb); kfree_skb(buf); diff --git a/net/tipc/node.c b/net/tipc/node.c index c8f6177dd5a2..ab04e00cb95b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -44,6 +44,7 @@ #include "discover.h" #include "netlink.h" #include "trace.h" +#include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 @@ -89,6 +90,7 @@ struct tipc_bclink_entry { * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node + * @preliminary: a preliminary node or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -99,6 +101,7 @@ struct tipc_bclink_entry { * @publ_list: list of publications * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; @@ -112,6 +115,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; @@ -120,12 +124,18 @@ struct tipc_node { u32 signature; u32 link_id; u8 peer_id[16]; + char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_crypto *crypto_rx; +#endif }; /* Node FSM states and events: @@ -163,7 +173,6 @@ static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); -static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); @@ -184,7 +193,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +203,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -235,15 +252,51 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } +u32 tipc_node_get_addr(struct tipc_node *node) +{ + return (node) ? node->addr : 0; +} + +char *tipc_node_get_id_str(struct tipc_node *node) +{ + return node->peer_id_string; +} + +#ifdef CONFIG_TIPC_CRYPTO +/** + * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * Note: node ref counter must be held first! + */ +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) +{ + return (__n) ? __n->crypto_rx : NULL; +} + +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) +{ + return container_of(pos, struct tipc_node, list)->crypto_rx; +} +#endif + +void tipc_node_free(struct rcu_head *rp) +{ + struct tipc_node *n = container_of(rp, struct tipc_node, rcu); + +#ifdef CONFIG_TIPC_CRYPTO + tipc_crypto_stop(&n->crypto_rx); +#endif + kfree(n); +} + static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); - kfree_rcu(n, rcu); + call_rcu(&n->rcu, tipc_node_free); } -static void tipc_node_put(struct tipc_node *node) +void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } @@ -264,7 +317,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { - if (node->addr != addr) + if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; @@ -360,18 +413,71 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; struct tipc_link *l; + unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); - n = tipc_node_find(net, addr); + n = tipc_node_find(net, addr) ?: + tipc_node_find_by_id(net, peer_id); if (n) { + if (!n->preliminary) + goto update; + if (preliminary) + goto exit; + /* A preliminary node becomes "real" now, refresh its data */ + tipc_node_write_lock(n); + n->preliminary = false; + n->addr = addr; + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, + &tn->node_htable[tipc_hashfn(addr)]); + list_del_rcu(&n->list); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); + tipc_node_write_unlock_fast(n); + +update: + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +495,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + + tipc_bcast_toggle_rcast(net, + (tn->capabilities & TIPC_BCAST_RCAST)); + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -396,9 +506,23 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, pr_warn("Node creation failed, no memory\n"); goto exit; } + tipc_nodeid2string(n->peer_id_string, peer_id); +#ifdef CONFIG_TIPC_CRYPTO + if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { + pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); + kfree(n); + n = NULL; + goto exit; + } +#endif n->addr = addr; + n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -417,22 +541,14 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, - tipc_link_window(tipc_bc_sndlink(net)), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, - tipc_bc_sndlink(net), - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no memory\n"); - kfree(n); - n = NULL; - goto exit; - } + n->bc_entry.link = NULL; tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); - n->keepalive_intv = U32_MAX; + /* Start a slow timer anyway, crypto needs it */ + n->keepalive_intv = 10000; + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) + tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) @@ -444,6 +560,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); @@ -617,12 +734,18 @@ static bool tipc_node_cleanup(struct tipc_node *peer) } tipc_node_write_unlock(peer); + if (!deleted) { + spin_unlock_bh(&tn->node_list_lock); + return deleted; + } + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } - + tipc_bcast_toggle_rcast(peer->net, + (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } @@ -645,6 +768,10 @@ static void tipc_node_timeout(struct timer_list *t) return; } +#ifdef CONFIG_TIPC_CRYPTO + /* Take any crypto key related actions first */ + tipc_crypto_timeout(n->crypto_rx); +#endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be @@ -665,7 +792,7 @@ static void tipc_node_timeout(struct timer_list *t) remains--; } tipc_node_read_unlock(n); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } @@ -697,7 +824,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE; + n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); @@ -751,7 +878,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } @@ -906,7 +1033,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } @@ -950,6 +1077,8 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; + bool preliminary; + u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); @@ -965,9 +1094,11 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { - addr = n->addr; + sugg_addr = n->addr; + preliminary = n->preliminary; tipc_node_put(n); - return addr; + if (!preliminary) + return sugg_addr; } /* Even this node may be in conflict */ @@ -979,12 +1110,12 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l; + struct tipc_link *l, *snd_l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -998,11 +1129,27 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, + false); if (!n) return; tipc_node_write_lock(n); + if (unlikely(!n->bc_entry.link)) { + snd_l = tipc_bc_sndlink(net); + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, + tipc_link_window(snd_l), + n->capabilities, + &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, + &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no mem\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + return; + } + } le = &n->links[b->identity]; @@ -1017,6 +1164,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (sign_match && addr_match && link_up) { /* All is fine. Do nothing. */ reset = false; + /* Peer node is not a container/local namespace */ + if (!n->peer_hash_mix) + n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -1342,7 +1492,8 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; - + n->peer_net = NULL; + n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1575,56 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,6 +1640,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; int bearer_id; int rc; @@ -1456,6 +1658,17 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, } tipc_node_read_lock(n); + node_up = node_is_up(n); + if (node_up && n->peer_net && check_net(n->peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(n->peer_net, list); + if (likely(skb_queue_empty(list))) { + tipc_node_read_unlock(n); + tipc_node_put(n); + return 0; + } + } + bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); @@ -1474,7 +1687,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); @@ -1622,7 +1835,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id } if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); @@ -1800,20 +2013,38 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; - struct tipc_node *n; + struct tipc_link_entry *le; struct tipc_msg *hdr; + struct tipc_node *n; int bearer_id = b->identity; - struct tipc_link_entry *le; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; +#ifdef CONFIG_TIPC_CRYPTO + struct tipc_ehdr *ehdr; - __skb_queue_head_init(&xmitq); + /* Check if message must be decrypted first */ + if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) + goto rcv; + + ehdr = (struct tipc_ehdr *)skb->data; + if (likely(ehdr->user != LINK_CONFIG)) { + n = tipc_node_find(net, ntohl(ehdr->addr)); + if (unlikely(!n)) + goto discard; + } else { + n = tipc_node_find_by_id(net, ehdr->id); + } + tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); + if (!skb) + return; +rcv: +#endif /* Ensure message is well-formed before touching the header */ - TIPC_SKB_CB(skb)->validated = false; if (unlikely(!tipc_msg_validate(&skb))) goto discard; + __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); @@ -1884,7 +2115,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); discard: @@ -1915,7 +2146,7 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, tipc_link_set_mtu(e->link, b->mtu); } tipc_node_write_unlock(n); - tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); @@ -1926,7 +2157,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; - struct tipc_node *peer; + struct tipc_node *peer, *temp_node; u32 addr; int err; @@ -1967,6 +2198,12 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) tipc_node_write_unlock(peer); tipc_node_delete(peer); + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); @@ -2011,6 +2248,8 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry_rcu(node, &tn->node_list, list) { + if (node->preliminary) + continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; @@ -2150,7 +2389,8 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, + NULL); return res; } @@ -2484,13 +2724,9 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_MON]) return -EINVAL; @@ -2530,11 +2766,141 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, return skb->len; } -u32 tipc_node_get_addr(struct tipc_node *node) +#ifdef CONFIG_TIPC_CRYPTO +static int tipc_nl_retrieve_key(struct nlattr **attrs, + struct tipc_aead_key **key) { - return (node) ? node->addr : 0; + struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; + + if (!attr) + return -ENODATA; + + *key = (struct tipc_aead_key *)nla_data(attr); + if (nla_len(attr) < tipc_aead_key_size(*key)) + return -EINVAL; + + return 0; +} + +static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; + + if (!attr) + return -ENODATA; + + if (nla_len(attr) < TIPC_NODEID_LEN) + return -EINVAL; + + *node_id = (u8 *)nla_data(attr); + return 0; +} + +int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n = NULL; + struct tipc_aead_key *ukey; + struct tipc_crypto *c; + u8 *id, *own_id; + int rc = 0; + + if (!info->attrs[TIPC_NLA_NODE]) + return -EINVAL; + + rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, + info->attrs[TIPC_NLA_NODE], + tipc_nl_node_policy, info->extack); + if (rc) + goto exit; + + own_id = tipc_own_id(net); + if (!own_id) { + rc = -EPERM; + goto exit; + } + + rc = tipc_nl_retrieve_key(attrs, &ukey); + if (rc) + goto exit; + + rc = tipc_aead_key_validate(ukey); + if (rc) + goto exit; + + rc = tipc_nl_retrieve_nodeid(attrs, &id); + switch (rc) { + case -ENODATA: + /* Cluster key mode */ + rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY); + break; + case 0: + /* Per-node key mode */ + if (!memcmp(id, own_id, NODE_ID_LEN)) { + c = tn->crypto_tx; + } else { + n = tipc_node_find_by_id(net, id) ?: + tipc_node_create(net, 0, id, 0xffffu, 0, true); + if (unlikely(!n)) { + rc = -ENOMEM; + break; + } + c = n->crypto_rx; + } + + rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY); + if (n) + tipc_node_put(n); + break; + default: + break; + } + +exit: + return (rc < 0) ? rc : 0; +} + +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_set_key(skb, info); + rtnl_unlock(); + + return err; +} + +int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + tipc_crypto_key_flush(tn->crypto_tx); + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) + tipc_crypto_key_flush(n->crypto_rx); + rcu_read_unlock(); + + pr_info("All keys are flushed!\n"); + return 0; } +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_node_flush_key(skb, info); + rtnl_unlock(); + + return err; +} +#endif + /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped @@ -2591,3 +2957,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 291d0ecd4101..a6803b449a2c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -54,7 +54,8 @@ enum { TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), - TIPC_TUNNEL_ENHANCED = (1 << 9) + TIPC_TUNNEL_ENHANCED = (1 << 9), + TIPC_NAGLE = (1 << 10) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -66,16 +67,27 @@ enum { TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ - TIPC_TUNNEL_ENHANCED) + TIPC_TUNNEL_ENHANCED | \ + TIPC_NAGLE) + #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_get_addr(struct tipc_node *node); +char *tipc_node_get_id_str(struct tipc_node *node); +void tipc_node_put(struct tipc_node *node); +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, + u16 capabilities, u32 hash_mixes, + bool preliminary); +#ifdef CONFIG_TIPC_CRYPTO +struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n); +struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos); +#endif u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -92,7 +104,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -107,4 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +#ifdef CONFIG_TIPC_CRYPTO +int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info); +#endif +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 4b92b196cfa6..a1c8d722ca20 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -75,6 +75,7 @@ struct sockaddr_pair { * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links @@ -97,6 +98,7 @@ struct tipc_sock { u32 conn_instance; int published; u32 max_pkt; + u32 maxnagle; u32 portid; struct tipc_msg phdr; struct list_head cong_links; @@ -116,6 +118,10 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + u32 oneway; + u16 snd_backlog; + bool expect_ack; + bool nodelay; bool group_is_open; }; @@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +static void tipc_sk_push_backlog(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) return 1; } +/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle + */ +static void tsk_set_nagle(struct tipc_sock *tsk) +{ + struct sock *sk = &tsk->sk; + + tsk->maxnagle = 0; + if (sk->sk_type != SOCK_STREAM) + return; + if (tsk->nodelay) + return; + if (!(tsk->peer_caps & TIPC_NAGLE)) + return; + /* Limit node local buffer size to avoid receive queue overflow */ + if (tsk->max_pkt == MAX_MSG_SIZE) + tsk->maxnagle = 1500; + else + tsk->maxnagle = tsk->max_pkt; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -512,8 +540,12 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Remove any pending SYN message */ - __skb_queue_purge(&sk->sk_write_queue); + /* Push out unsent messages or remove if pending SYN */ + skb = skb_peek(&sk->sk_write_queue); + if (skb && !msg_is_syn(buf_msg(skb))) + tipc_sk_push_backlog(tsk); + else + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -854,7 +886,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1208,6 +1240,27 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_sk_rcv(net, inputq); } +/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +static void tipc_sk_push_backlog(struct tipc_sock *tsk) +{ + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); + int rc; + + if (skb_queue_empty(txq) || tsk->cong_link_cnt) + return; + + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; + tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +} + /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket @@ -1221,7 +1274,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); - bool conn_cong; + bool was_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) { @@ -1254,11 +1307,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { - conn_cong = tsk_conn_cong(tsk); + was_cong = tsk_conn_cong(tsk); + tsk->expect_ack = false; + tipc_sk_push_backlog(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); - if (conn_cong) + if (was_cong && !tsk_conn_cong(tsk)) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); @@ -1388,7 +1443,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return rc; __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1437,15 +1492,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct sk_buff_head *txq = &sk->sk_write_queue; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; int send, sent = 0; - int rc = 0; - - __skb_queue_head_init(&pkts); + int blocks, rc = 0; if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; @@ -1467,21 +1522,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tipc_sk_connected(sk))); if (unlikely(rc)) break; - send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); - if (unlikely(rc != send)) - break; - - trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + blocks = tsk->snd_backlog; + if (tsk->oneway++ >= 4 && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; + } + tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) + break; + blocks += tsk_inc(tsk, send + MIN_H_SIZE); + } + trace_tipc_sk_sendstream(sk, skb_peek(txq), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + tsk->snt_unacked += blocks; + tsk->snd_backlog = 0; sent += send; } } while (sent < dlen && !rc); @@ -1526,8 +1595,9 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + tsk_set_nagle(tsk); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1848,6 +1918,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; + bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1892,6 +1963,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { + ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -1919,7 +1991,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -1990,6 +2062,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; + tipc_sk_push_backlog(tsk); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2029,6 +2102,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (unlikely(msg_mcast(hdr))) return false; + tsk->oneway = 0; switch (sk->sk_state) { case TIPC_CONNECTING: @@ -2074,6 +2148,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; return false; case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) + tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) return true; @@ -2804,7 +2880,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) struct tipc_sock *tsk; rcu_read_lock(); - tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); + tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); @@ -2959,6 +3035,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: + case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) @@ -3007,6 +3084,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; + case TIPC_NODELAY: + tsk->nodelay = !!value; + tsk_set_nagle(tsk); + break; default: res = -EINVAL; } @@ -3588,13 +3669,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 6159d327db76..58ab3d6dcdce 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -35,6 +35,7 @@ #include "core.h" #include "trace.h" +#include "crypto.h" #include <linux/sysctl.h> @@ -64,6 +65,16 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_TIPC_CRYPTO + { + .procname = "max_tfms", + .data = &sysctl_tipc_max_tfms, + .maxlen = sizeof(sysctl_tipc_max_tfms), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif {} }; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 287df68721df..86aaa4d3e781 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -372,6 +372,7 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto out; if (b && test_bit(0, &b->up)) { + TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } @@ -448,15 +449,11 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; - struct nlattr **attrs; char *bname; - err = tipc_nlmsg_parse(cb->nlh, &attrs); - if (err) - return err; - if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index e4328b3b72eb..61ec78521a60 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -26,3 +26,13 @@ config TLS_DEVICE Enable kernel support for HW offload of the TLS protocol. If unsure, say N. + +config TLS_TOE + bool "Transport Layer Security TCP stack bypass" + depends on TLS + default n + help + Enable kernel support for legacy HW offload of the TLS protocol, + which is incompatible with the Linux networking stack semantics. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index ef0dc74ce8f9..f1ffbfe8968d 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -3,8 +3,11 @@ # Makefile for the TLS subsystem. # +CFLAGS_trace.o := -I$(src) + obj-$(CONFIG_TLS) += tls.o -tls-y := tls_main.o tls_sw.o +tls-y := tls_main.o tls_sw.o tls_proc.o trace.o +tls-$(CONFIG_TLS_TOE) += tls_toe.o tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 683d00837693..0683788bbef0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -38,6 +38,8 @@ #include <net/tcp.h> #include <net/tls.h> +#include "trace.h" + /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ @@ -202,6 +204,15 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); + WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); +} +EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); + static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { @@ -216,6 +227,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, rcd_sn = tls_ctx->tx.rec_seq; + trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = tls_ctx->netdev; if (netdev) @@ -419,7 +431,7 @@ static int tls_push_data(struct sock *sk, ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) return -ENOTSUPP; - if (sk->sk_err) + if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; @@ -440,9 +452,8 @@ static int tls_push_data(struct sock *sk, max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { - rc = tls_do_allocation(sk, ctx, pfrag, - prot->prepend_size); - if (rc) { + rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); + if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; @@ -645,15 +656,19 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) return; + + trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); netdev = READ_ONCE(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) @@ -661,8 +676,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; - u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -691,8 +706,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ - if (tcp_inq(sk) > rcd_len) + sock_data = tcp_inq(sk); + if (sock_data > rcd_len) { + trace_tls_device_rx_resync_nh_delay(sk, sock_data, + rcd_len); return; + } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; @@ -736,6 +755,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { + trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -834,9 +854,9 @@ free_buf: return err; } -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { - struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); int is_decrypted = skb->decrypted; int is_encrypted = !is_decrypted; @@ -848,6 +868,10 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) is_encrypted &= !skb_iter->decrypted; } + trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, + tls_ctx->rx.rec_seq, rxm->full_len, + is_encrypted, is_decrypted); + ctx->sw.decrypted |= is_decrypted; /* Return immediately if the record is either entirely plaintext or @@ -1021,6 +1045,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, + tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; @@ -1057,6 +1083,7 @@ free_marker_record: int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { + struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; @@ -1104,6 +1131,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); + info = (void *)&ctx->crypto_recv.info; + trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, + tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f874cc0da45d..bdca31ffe6da 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -41,7 +41,9 @@ #include <linux/inetdevice.h> #include <linux/inet_diag.h> +#include <net/snmp.h> #include <net/tls.h> +#include <net/tls_toe.h> MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); @@ -58,14 +60,12 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto *saved_tcpv4_prot; static DEFINE_MUTEX(tcpv4_prot_mutex); -static LIST_HEAD(device_list); -static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base); -static void update_sk_prot(struct sock *sk, struct tls_context *ctx) +void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; @@ -287,14 +287,19 @@ static void tls_sk_proto_cleanup(struct sock *sk, kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_release_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } - if (ctx->rx_conf == TLS_SW) + if (ctx->rx_conf == TLS_SW) { tls_sw_release_resources_rx(sk); - else if (ctx->rx_conf == TLS_HW) + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } else if (ctx->rx_conf == TLS_HW) { tls_device_offload_cleanup_rx(sk); + TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } } static void tls_sk_proto_close(struct sock *sk, long timeout) @@ -535,19 +540,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, if (tx) { rc = tls_set_device_offload(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 1); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); conf = TLS_SW; } } else { rc = tls_set_device_offload_rx(sk, ctx); conf = TLS_HW; - if (rc) { + if (!rc) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); + } else { rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto err_crypto_info; + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); conf = TLS_SW; } tls_sw_strparser_arm(sk, ctx); @@ -604,7 +619,7 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } -static struct tls_context *create_ctx(struct sock *sk) +struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; @@ -645,90 +660,6 @@ static void tls_build_proto(struct sock *sk) } } -static void tls_hw_sk_destruct(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - ctx->sk_destruct(sk); - /* Free ctx */ - rcu_assign_pointer(icsk->icsk_ulp_data, NULL); - tls_ctx_free(sk, ctx); -} - -static int tls_hw_prot(struct sock *sk) -{ - struct tls_context *ctx; - struct tls_device *dev; - int rc = 0; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->feature && dev->feature(dev)) { - ctx = create_ctx(sk); - if (!ctx) - goto out; - - spin_unlock_bh(&device_spinlock); - tls_build_proto(sk); - ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_hw_sk_destruct; - ctx->rx_conf = TLS_HW_RECORD; - ctx->tx_conf = TLS_HW_RECORD; - update_sk_prot(sk, ctx); - spin_lock_bh(&device_spinlock); - rc = 1; - break; - } - } -out: - spin_unlock_bh(&device_spinlock); - return rc; -} - -static void tls_hw_unhash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->unhash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - dev->unhash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - ctx->sk_proto->unhash(sk); -} - -static int tls_hw_hash(struct sock *sk) -{ - struct tls_context *ctx = tls_get_ctx(sk); - struct tls_device *dev; - int err; - - err = ctx->sk_proto->hash(sk); - spin_lock_bh(&device_spinlock); - list_for_each_entry(dev, &device_list, dev_list) { - if (dev->hash) { - kref_get(&dev->kref); - spin_unlock_bh(&device_spinlock); - err |= dev->hash(dev, sk); - kref_put(&dev->kref, dev->release); - spin_lock_bh(&device_spinlock); - } - } - spin_unlock_bh(&device_spinlock); - - if (err) - tls_hw_unhash(sk); - return err; -} - static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], struct proto *base) { @@ -766,10 +697,11 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif - +#ifdef CONFIG_TLS_TOE prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash; +#endif } static int tls_init(struct sock *sk) @@ -777,8 +709,12 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; - if (tls_hw_prot(sk)) + tls_build_proto(sk); + +#ifdef CONFIG_TLS_TOE + if (tls_toe_bypass(sk)) return 0; +#endif /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. @@ -789,11 +725,9 @@ static int tls_init(struct sock *sk) if (sk->sk_state != TCP_ESTABLISHED) return -ENOTSUPP; - tls_build_proto(sk); - /* allocate tls context */ write_lock_bh(&sk->sk_callback_lock); - ctx = create_ctx(sk); + ctx = tls_ctx_create(sk); if (!ctx) { rc = -ENOMEM; goto out; @@ -879,21 +813,34 @@ static size_t tls_get_info_size(const struct sock *sk) return size; } -void tls_register_device(struct tls_device *device) +static int __net_init tls_init_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_add_tail(&device->dev_list, &device_list); - spin_unlock_bh(&device_spinlock); + int err; + + net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib); + if (!net->mib.tls_statistics) + return -ENOMEM; + + err = tls_proc_init(net); + if (err) + goto err_free_stats; + + return 0; +err_free_stats: + free_percpu(net->mib.tls_statistics); + return err; } -EXPORT_SYMBOL(tls_register_device); -void tls_unregister_device(struct tls_device *device) +static void __net_exit tls_exit_net(struct net *net) { - spin_lock_bh(&device_spinlock); - list_del(&device->dev_list); - spin_unlock_bh(&device_spinlock); + tls_proc_fini(net); + free_percpu(net->mib.tls_statistics); } -EXPORT_SYMBOL(tls_unregister_device); + +static struct pernet_operations tls_proc_ops = { + .init = tls_init_net, + .exit = tls_exit_net, +}; static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", @@ -906,6 +853,12 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init tls_register(void) { + int err; + + err = register_pernet_subsys(&tls_proc_ops); + if (err) + return err; + tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked, @@ -920,6 +873,7 @@ static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); tls_device_cleanup(); + unregister_pernet_subsys(&tls_proc_ops); } module_init(tls_register); diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c new file mode 100644 index 000000000000..3a5dd1e07233 --- /dev/null +++ b/net/tls/tls_proc.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/snmp.h> +#include <net/tls.h> + +#ifdef CONFIG_PROC_FS +static const struct snmp_mib tls_mib_list[] = { + SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), + SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), + SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), + SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), + SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), + SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), + SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), + SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), + SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), + SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_SENTINEL +}; + +static int tls_statistics_seq_show(struct seq_file *seq, void *v) +{ + unsigned long buf[LINUX_MIB_TLSMAX] = {}; + struct net *net = seq->private; + int i; + + snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); + for (i = 0; tls_mib_list[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); + + return 0; +} +#endif + +int __net_init tls_proc_init(struct net *net) +{ + if (!proc_create_net_single("tls_stat", 0444, net->proc_net, + tls_statistics_seq_show, NULL)) + return -ENOMEM; + return 0; +} + +void __net_exit tls_proc_fini(struct net *net) +{ + remove_proc_entry("tls_stat", net->proc_net); +} diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 319735d5c084..da9f9ce51e7b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) /* Propagate if there was an err */ if (err) { + if (err == -EBADMSG) + TLS_INC_STATS(sock_net(skb->sk), + LINUX_MIB_TLSDECRYPTERROR); ctx->async_wait.err = err; tls_err_abort(skb->sk, err); } else { @@ -253,6 +256,8 @@ static int tls_do_decryption(struct sock *sk, return ret; ret = crypto_wait_req(ret, &ctx->async_wait); + } else if (ret == -EBADMSG) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); } if (async) @@ -1492,7 +1497,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (!ctx->decrypted) { if (tls_ctx->rx_conf == TLS_HW) { - err = tls_device_decrypted(sk, skb); + err = tls_device_decrypted(sk, tls_ctx, skb, rxm); if (err < 0) return err; } @@ -1520,7 +1525,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - ctx->decrypted = true; + ctx->decrypted = 1; ctx->saved_data_ready(sk); } else { *zc = false; @@ -1930,7 +1935,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, tls_err_abort(sk, EBADMSG); goto splice_read_end; } - ctx->decrypted = true; + ctx->decrypted = 1; } rxm = strp_msg(skb); @@ -2031,7 +2036,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - ctx->decrypted = false; + ctx->decrypted = 0; ctx->recv_pkt = skb; strp_pause(strp); @@ -2387,10 +2392,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = false; + sw_ctx_rx->async_capable = 0; else sw_ctx_rx->async_capable = - tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + !!(tfm->__crt_alg->cra_flags & + CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c new file mode 100644 index 000000000000..7e1330f19165 --- /dev/null +++ b/net/tls/tls_toe.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <net/inet_connection_sock.h> +#include <net/tls.h> +#include <net/tls_toe.h> + +static LIST_HEAD(device_list); +static DEFINE_SPINLOCK(device_spinlock); + +static void tls_toe_sk_destruct(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx = tls_get_ctx(sk); + + ctx->sk_destruct(sk); + /* Free ctx */ + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tls_ctx_free(sk, ctx); +} + +int tls_toe_bypass(struct sock *sk) +{ + struct tls_toe_device *dev; + struct tls_context *ctx; + int rc = 0; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = tls_ctx_create(sk); + if (!ctx) + goto out; + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_toe_sk_destruct; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + spin_unlock_bh(&device_spinlock); + return rc; +} + +void tls_toe_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + dev->unhash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + ctx->sk_proto->unhash(sk); +} + +int tls_toe_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_toe_device *dev; + int err; + + err = ctx->sk_proto->hash(sk); + spin_lock_bh(&device_spinlock); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); + err |= dev->hash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } + } + spin_unlock_bh(&device_spinlock); + + if (err) + tls_toe_unhash(sk); + return err; +} + +void tls_toe_register_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_add_tail(&device->dev_list, &device_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_register_device); + +void tls_toe_unregister_device(struct tls_toe_device *device) +{ + spin_lock_bh(&device_spinlock); + list_del(&device->dev_list); + spin_unlock_bh(&device_spinlock); +} +EXPORT_SYMBOL(tls_toe_unregister_device); diff --git a/net/tls/trace.c b/net/tls/trace.c new file mode 100644 index 000000000000..e374913cf9c9 --- /dev/null +++ b/net/tls/trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/tls/trace.h b/net/tls/trace.h new file mode 100644 index 000000000000..9ba5f600ea43 --- /dev/null +++ b/net/tls/trace.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tls + +#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _TLS_TRACE_H_ + +#include <asm/unaligned.h> +#include <linux/tracepoint.h> + +struct sock; + +TRACE_EVENT(tls_device_offload_set, + + TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret), + + TP_ARGS(sk, dir, tcp_seq, rec_no, ret), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( int, dir ) + __field( u32, tcp_seq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->dir = dir; + __entry->tcp_seq = tcp_seq; + __entry->ret = ret; + ), + + TP_printk( + "sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d", + __entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no, + __entry->ret + ) +); + +TRACE_EVENT(tls_device_decrypted, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, u32 rec_len, + bool encrypted, bool decrypted), + + TP_ARGS(sk, tcp_seq, rec_no, rec_len, encrypted, decrypted), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( u32, rec_len ) + __field( bool, encrypted ) + __field( bool, decrypted ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->rec_len = rec_len; + __entry->encrypted = encrypted; + __entry->decrypted = decrypted; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu len=%u encrypted=%d decrypted=%d", + __entry->sk, __entry->tcp_seq, + __entry->rec_no, __entry->rec_len, + __entry->encrypted, __entry->decrypted + ) +); + +TRACE_EVENT(tls_device_rx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type), + + TP_ARGS(sk, tcp_seq, rec_no, sync_type), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + __field( int, sync_type ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + __entry->sync_type = sync_type; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu sync_type=%d", + __entry->sk, __entry->tcp_seq, __entry->rec_no, + __entry->sync_type + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_schedule, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + ), + + TP_fast_assign( + __entry->sk = sk; + ), + + TP_printk( + "sk=%p", __entry->sk + ) +); + +TRACE_EVENT(tls_device_rx_resync_nh_delay, + + TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len), + + TP_ARGS(sk, sock_data, rec_len), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, sock_data ) + __field( u32, rec_len ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->sock_data = sock_data; + __entry->rec_len = rec_len; + ), + + TP_printk( + "sk=%p sock_data=%u rec_len=%u", + __entry->sk, __entry->sock_data, __entry->rec_len + ) +); + +TRACE_EVENT(tls_device_tx_resync_req, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq), + + TP_ARGS(sk, tcp_seq, exp_tcp_seq), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u32, tcp_seq ) + __field( u32, exp_tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->tcp_seq = tcp_seq; + __entry->exp_tcp_seq = exp_tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u exp_tcp_seq=%u", + __entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq + ) +); + +TRACE_EVENT(tls_device_tx_resync_send, + + TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no), + + TP_ARGS(sk, tcp_seq, rec_no), + + TP_STRUCT__entry( + __field( struct sock *, sk ) + __field( u64, rec_no ) + __field( u32, tcp_seq ) + ), + + TP_fast_assign( + __entry->sk = sk; + __entry->rec_no = get_unaligned_be64(rec_no); + __entry->tcp_seq = tcp_seq; + ), + + TP_printk( + "sk=%p tcp_seq=%u rec_no=%llu", + __entry->sk, __entry->tcp_seq, __entry->rec_no + ) +); + +#endif /* _TLS_TRACE_H_ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include <trace/define_trace.h> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0d8da809bea2..193cba2d777b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -284,11 +284,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) - goto found; + return s; } - s = NULL; -found: - return s; + return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 582a3e4dfce2..74db4cd637a7 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -126,19 +126,18 @@ static struct proto vsock_proto = { */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) -static const struct vsock_transport *transport; +#define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) +#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 + +/* Transport used for host->guest communication */ +static const struct vsock_transport *transport_h2g; +/* Transport used for guest->host communication */ +static const struct vsock_transport *transport_g2h; +/* Transport used for DGRAM communication */ +static const struct vsock_transport *transport_dgram; static DEFINE_MUTEX(vsock_register_mutex); -/**** EXPORTS ****/ - -/* Get the ID of the local context. This is transport dependent. */ - -int vm_sockets_get_local_cid(void) -{ - return transport->get_local_cid(); -} -EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); - /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected @@ -188,7 +187,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk) return __vsock_bind(sk, &local_addr); } -static int __init vsock_init_tables(void) +static void vsock_init_tables(void) { int i; @@ -197,7 +196,6 @@ static int __init vsock_init_tables(void) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); - return 0; } static void __vsock_insert_bound(struct list_head *list, @@ -230,9 +228,15 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; - list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) - if (addr->svm_port == vsk->local_addr.svm_port) + list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { + if (vsock_addr_equals_addr(addr, &vsk->local_addr)) + return sk_vsock(vsk); + + if (addr->svm_port == vsk->local_addr.svm_port && + (vsk->local_addr.svm_cid == VMADDR_CID_ANY || + addr->svm_cid == VMADDR_CID_ANY)) return sk_vsock(vsk); + } return NULL; } @@ -382,6 +386,88 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); +static void vsock_deassign_transport(struct vsock_sock *vsk) +{ + if (!vsk->transport) + return; + + vsk->transport->destruct(vsk); + module_put(vsk->transport->module); + vsk->transport = NULL; +} + +/* Assign a transport to a socket and call the .init transport callback. + * + * Note: for stream socket this must be called when vsk->remote_addr is set + * (e.g. during the connect() or when a connection request on a listener + * socket is received). + * The vsk->remote_addr is used to decide which transport to use: + * - remote CID <= VMADDR_CID_HOST will use guest->host transport; + * - remote CID == local_cid (guest->host transport) will use guest->host + * transport for loopback (host->guest transports don't support loopback); + * - remote CID > VMADDR_CID_HOST will use host->guest transport; + */ +int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) +{ + const struct vsock_transport *new_transport; + struct sock *sk = sk_vsock(vsk); + unsigned int remote_cid = vsk->remote_addr.svm_cid; + int ret; + + switch (sk->sk_type) { + case SOCK_DGRAM: + new_transport = transport_dgram; + break; + case SOCK_STREAM: + if (remote_cid <= VMADDR_CID_HOST || + (transport_g2h && + remote_cid == transport_g2h->get_local_cid())) + new_transport = transport_g2h; + else + new_transport = transport_h2g; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (vsk->transport) { + if (vsk->transport == new_transport) + return 0; + + vsk->transport->release(vsk); + vsock_deassign_transport(vsk); + } + + /* We increase the module refcnt to prevent the transport unloading + * while there are open sockets assigned to it. + */ + if (!new_transport || !try_module_get(new_transport->module)) + return -ENODEV; + + ret = new_transport->init(vsk, psk); + if (ret) { + module_put(new_transport->module); + return ret; + } + + vsk->transport = new_transport; + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_assign_transport); + +bool vsock_find_cid(unsigned int cid) +{ + if (transport_g2h && cid == transport_g2h->get_local_cid()) + return true; + + if (transport_h2g && cid == VMADDR_CID_HOST) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(vsock_find_cid); + static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; @@ -418,7 +504,12 @@ static bool vsock_is_pending(struct sock *sk) static int vsock_send_shutdown(struct sock *sk, int mode) { - return transport->shutdown(vsock_sk(sk), mode); + struct vsock_sock *vsk = vsock_sk(sk); + + if (!vsk->transport) + return -ENODEV; + + return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) @@ -439,7 +530,7 @@ static void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -528,13 +619,12 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - return transport->dgram_bind(vsk, addr); + return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); - u32 cid; int retval; /* First ensure this socket isn't already bound. */ @@ -544,10 +634,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most - * cases), we only allow binding to the local CID. + * cases), we only allow binding to a local CID. */ - cid = transport->get_local_cid(); - if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) + if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { @@ -571,12 +660,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) static void vsock_connect_timeout(struct work_struct *work); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, - unsigned short type, - int kern) +static struct sock *__vsock_create(struct net *net, + struct socket *sock, + struct sock *parent, + gfp_t priority, + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; @@ -620,28 +709,24 @@ struct sock *__vsock_create(struct net *net, vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; + vsk->buffer_size = psk->buffer_size; + vsk->buffer_min_size = psk->buffer_min_size; + vsk->buffer_max_size = psk->buffer_max_size; } else { vsk->trusted = capable(CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; + vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; + vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; + vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } - if (transport->init(vsk, psk) < 0) { - sk_free(sk); - return NULL; - } - - if (sock) - vsock_insert_unbound(vsk); - return sk; } -EXPORT_SYMBOL_GPL(__vsock_create); static void __vsock_release(struct sock *sk, int level) { if (sk) { - struct sk_buff *skb; struct sock *pending; struct vsock_sock *vsk; @@ -651,7 +736,10 @@ static void __vsock_release(struct sock *sk, int level) /* The release call is supposed to use lock_sock_nested() * rather than lock_sock(), if a sock lock should be acquired. */ - transport->release(vsk); + if (vsk->transport) + vsk->transport->release(vsk); + else if (sk->sk_type == SOCK_STREAM) + vsock_remove_sock(vsk); /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking @@ -662,8 +750,7 @@ static void __vsock_release(struct sock *sk, int level) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; - while ((skb = skb_dequeue(&sk->sk_receive_queue))) - kfree_skb(skb); + skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { @@ -680,7 +767,7 @@ static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); - transport->destruct(vsk); + vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. @@ -702,15 +789,22 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return err; } +struct sock *vsock_create_connected(struct sock *parent) +{ + return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, + parent->sk_type, 0); +} +EXPORT_SYMBOL_GPL(vsock_create_connected); + s64 vsock_stream_has_data(struct vsock_sock *vsk) { - return transport->stream_has_data(vsk); + return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { - return transport->stream_has_space(vsk); + return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); @@ -879,6 +973,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock->type == SOCK_STREAM) { + const struct vsock_transport *transport = vsk->transport; lock_sock(sk); /* Listening sockets that have connections in their accept @@ -889,7 +984,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ - if (transport->stream_is_active(vsk) && + if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int ret = transport->notify_poll_in( @@ -954,6 +1049,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; + const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -962,6 +1058,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); @@ -1046,8 +1143,8 @@ static int vsock_dgram_connect(struct socket *sock, if (err) goto out; - if (!transport->dgram_allow(remote_addr->svm_cid, - remote_addr->svm_port)) { + if (!vsk->transport->dgram_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { err = -EINVAL; goto out; } @@ -1063,7 +1160,9 @@ out: static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags); + struct vsock_sock *vsk = vsock_sk(sock->sk); + + return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } static const struct proto_ops vsock_dgram_ops = { @@ -1089,6 +1188,8 @@ static const struct proto_ops vsock_dgram_ops = { static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { + const struct vsock_transport *transport = vsk->transport; + if (!transport->cancel_pkt) return -EOPNOTSUPP; @@ -1125,6 +1226,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); @@ -1159,19 +1261,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, goto out; } + /* Set the remote address that we are connecting to. */ + memcpy(&vsk->remote_addr, remote_addr, + sizeof(vsk->remote_addr)); + + err = vsock_assign_transport(vsk, NULL); + if (err) + goto out; + + transport = vsk->transport; + /* The hypervisor and well-known contexts do not have socket * endpoints. */ - if (!transport->stream_allow(remote_addr->svm_cid, + if (!transport || + !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } - /* Set the remote address that we are connecting to. */ - memcpy(&vsk->remote_addr, remote_addr, - sizeof(vsk->remote_addr)); - err = vsock_auto_bind(vsk); if (err) goto out; @@ -1301,7 +1410,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, err = -listener->sk_err; if (connected) { - listener->sk_ack_backlog--; + sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); @@ -1366,6 +1475,23 @@ out: return err; } +static void vsock_update_buffer_size(struct vsock_sock *vsk, + const struct vsock_transport *transport, + u64 val) +{ + if (val > vsk->buffer_max_size) + val = vsk->buffer_max_size; + + if (val < vsk->buffer_min_size) + val = vsk->buffer_min_size; + + if (val != vsk->buffer_size && + transport && transport->notify_buffer_size) + transport->notify_buffer_size(vsk, &val); + + vsk->buffer_size = val; +} + static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, @@ -1375,6 +1501,7 @@ static int vsock_stream_setsockopt(struct socket *sock, int err; struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK) @@ -1395,23 +1522,26 @@ static int vsock_stream_setsockopt(struct socket *sock, err = 0; sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; lock_sock(sk); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); - transport->set_buffer_size(vsk, val); + vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); - transport->set_max_buffer_size(vsk, val); + vsk->buffer_max_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); - transport->set_min_buffer_size(vsk, val); + vsk->buffer_min_size = val; + vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT: { @@ -1478,17 +1608,17 @@ static int vsock_stream_getsockopt(struct socket *sock, switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: - val = transport->get_buffer_size(vsk); + val = vsk->buffer_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: - val = transport->get_max_buffer_size(vsk); + val = vsk->buffer_max_size; COPY_OUT(val); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: - val = transport->get_min_buffer_size(vsk); + val = vsk->buffer_min_size; COPY_OUT(val); break; @@ -1519,6 +1649,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; @@ -1527,6 +1658,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; total_written = 0; err = 0; @@ -1548,7 +1680,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != TCP_ESTABLISHED || + if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1658,6 +1790,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, { struct sock *sk; struct vsock_sock *vsk; + const struct vsock_transport *transport; int err; size_t target; ssize_t copied; @@ -1668,11 +1801,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sk = sock->sk; vsk = vsock_sk(sk); + transport = vsk->transport; err = 0; lock_sock(sk); - if (sk->sk_state != TCP_ESTABLISHED) { + if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the @@ -1846,6 +1980,10 @@ static const struct proto_ops vsock_stream_ops = { static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct vsock_sock *vsk; + struct sock *sk; + int ret; + if (!sock) return -EINVAL; @@ -1865,7 +2003,23 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; + sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); + if (!sk) + return -ENOMEM; + + vsk = vsock_sk(sk); + + if (sock->type == SOCK_DGRAM) { + ret = vsock_assign_transport(vsk, NULL); + if (ret < 0) { + sock_put(sk); + return ret; + } + } + + vsock_insert_unbound(vsk); + + return 0; } static const struct net_proto_family vsock_family_ops = { @@ -1878,11 +2032,20 @@ static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; + u32 cid = VMADDR_CID_ANY; int retval = 0; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: - if (put_user(transport->get_local_cid(), p) != 0) + /* To be compatible with the VMCI behavior, we prioritize the + * guest CID instead of well-know host CID (VMADDR_CID_HOST). + */ + if (transport_g2h) + cid = transport_g2h->get_local_cid(); + else if (transport_h2g) + cid = transport_h2g->get_local_cid(); + + if (put_user(cid, p) != 0) retval = -EFAULT; break; @@ -1922,24 +2085,13 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; -int __vsock_core_init(const struct vsock_transport *t, struct module *owner) +static int __init vsock_init(void) { - int err = mutex_lock_interruptible(&vsock_register_mutex); + int err = 0; - if (err) - return err; - - if (transport) { - err = -EBUSY; - goto err_busy; - } - - /* Transport must be the owner of the protocol so that it can't - * unload while there are open sockets. - */ - vsock_proto.owner = owner; - transport = t; + vsock_init_tables(); + vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { @@ -1960,7 +2112,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) goto err_unregister_proto; } - mutex_unlock(&vsock_register_mutex); return 0; err_unregister_proto: @@ -1968,44 +2119,86 @@ err_unregister_proto: err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: - transport = NULL; -err_busy: - mutex_unlock(&vsock_register_mutex); return err; } -EXPORT_SYMBOL_GPL(__vsock_core_init); -void vsock_core_exit(void) +static void __exit vsock_exit(void) { - mutex_lock(&vsock_register_mutex); - misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); - - /* We do not want the assignment below re-ordered. */ - mb(); - transport = NULL; - - mutex_unlock(&vsock_register_mutex); } -EXPORT_SYMBOL_GPL(vsock_core_exit); -const struct vsock_transport *vsock_core_get_transport(void) +const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { - /* vsock_register_mutex not taken since only the transport uses this - * function and only while registered. - */ - return transport; + return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); -static void __exit vsock_exit(void) +int vsock_core_register(const struct vsock_transport *t, int features) +{ + const struct vsock_transport *t_h2g, *t_g2h, *t_dgram; + int err = mutex_lock_interruptible(&vsock_register_mutex); + + if (err) + return err; + + t_h2g = transport_h2g; + t_g2h = transport_g2h; + t_dgram = transport_dgram; + + if (features & VSOCK_TRANSPORT_F_H2G) { + if (t_h2g) { + err = -EBUSY; + goto err_busy; + } + t_h2g = t; + } + + if (features & VSOCK_TRANSPORT_F_G2H) { + if (t_g2h) { + err = -EBUSY; + goto err_busy; + } + t_g2h = t; + } + + if (features & VSOCK_TRANSPORT_F_DGRAM) { + if (t_dgram) { + err = -EBUSY; + goto err_busy; + } + t_dgram = t; + } + + transport_h2g = t_h2g; + transport_g2h = t_g2h; + transport_dgram = t_dgram; + +err_busy: + mutex_unlock(&vsock_register_mutex); + return err; +} +EXPORT_SYMBOL_GPL(vsock_core_register); + +void vsock_core_unregister(const struct vsock_transport *t) { - /* Do nothing. This function makes this module removable. */ + mutex_lock(&vsock_register_mutex); + + if (transport_h2g == t) + transport_h2g = NULL; + + if (transport_g2h == t) + transport_g2h = NULL; + + if (transport_dgram == t) + transport_dgram = NULL; + + mutex_unlock(&vsock_register_mutex); } +EXPORT_SYMBOL_GPL(vsock_core_unregister); -module_init(vsock_init_tables); +module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index c443db7af8d4..3c7d07a99fc5 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -13,15 +13,16 @@ #include <linux/hyperv.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <asm/hyperv-tlfs.h> /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some - * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer - * hosts don't have this limitation; but, keep the defaults the same for compat. + * stricter requirements on the hv_sock ring buffer size of six 4K pages. + * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this + * limitation; but, keep the defaults the same for compat. */ -#define PAGE_SIZE_4K 4096 -#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) -#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) +#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) +#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -54,7 +55,8 @@ struct hvs_recv_buf { * ringbuffer APIs that allow us to directly copy data from userspace buffer * to VMBus ringbuffer. */ -#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) +#define HVS_SEND_BUF_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) struct hvs_send_buf { /* The header before the payload data */ @@ -163,6 +165,8 @@ static const guid_t srv_id_template = GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); +static bool hvs_check_transport(struct vsock_sock *vsk); + static bool is_valid_srv_id(const guid_t *id) { return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4); @@ -186,7 +190,8 @@ static void hvs_remote_addr_init(struct sockaddr_vm *remote, static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; struct sock *sk; - vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY); + /* Remote peer is always the host */ + vsock_addr_init(remote, VMADDR_CID_HOST, VMADDR_PORT_ANY); while (1) { /* Wrap around ? */ @@ -358,13 +363,24 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) goto out; - new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + new = vsock_create_connected(sk); if (!new) goto out; new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); + + hvs_addr_init(&vnew->local_addr, if_type); + hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); + + ret = vsock_assign_transport(vnew, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the + * same where we received the request. + */ + if (ret || !hvs_check_transport(vnew)) { + sock_put(new); + goto out; + } hvs_new = vnew->trans; hvs_new->chan = chan; } else { @@ -393,10 +409,10 @@ static void hvs_open_connection(struct vmbus_channel *chan) } else { sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); - sndbuf = ALIGN(sndbuf, PAGE_SIZE); + sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); - rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); } ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, @@ -426,10 +442,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (conn_from_host) { new->sk_state = TCP_ESTABLISHED; - sk->sk_ack_backlog++; - - hvs_addr_init(&vnew->local_addr, if_type); - hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); + sk_acceptq_added(sk); hvs_new->vm_srv_id = *if_type; hvs_new->host_srv_id = *if_instance; @@ -670,7 +683,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, ssize_t ret = 0; ssize_t bytes_written = 0; - BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); + BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); if (!send_buf) @@ -843,37 +856,9 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, return 0; } -static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - /* Ignored. */ -} - -static u64 hvs_get_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - -static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk) -{ - return -ENOPROTOOPT; -} - static struct vsock_transport hvs_transport = { + .module = THIS_MODULE, + .get_local_cid = hvs_get_local_cid, .init = hvs_sock_init, @@ -906,14 +891,13 @@ static struct vsock_transport hvs_transport = { .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, .notify_send_post_enqueue = hvs_notify_send_post_enqueue, - .set_buffer_size = hvs_set_buffer_size, - .set_min_buffer_size = hvs_set_min_buffer_size, - .set_max_buffer_size = hvs_set_max_buffer_size, - .get_buffer_size = hvs_get_buffer_size, - .get_min_buffer_size = hvs_get_min_buffer_size, - .get_max_buffer_size = hvs_get_max_buffer_size, }; +static bool hvs_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &hvs_transport; +} + static int hvs_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { @@ -962,7 +946,7 @@ static int __init hvs_init(void) if (ret != 0) return ret; - ret = vsock_core_init(&hvs_transport); + ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H); if (ret) { vmbus_driver_unregister(&hvs_drv); return ret; @@ -973,7 +957,7 @@ static int __init hvs_init(void) static void __exit hvs_exit(void) { - vsock_core_exit(); + vsock_core_unregister(&hvs_transport); vmbus_driver_unregister(&hvs_drv); } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 082a30936690..1458c5c8b64d 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -86,33 +86,6 @@ out_rcu: return ret; } -static void virtio_transport_loopback_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, loopback_work); - LIST_HEAD(pkts); - - spin_lock_bh(&vsock->loopback_list_lock); - list_splice_init(&vsock->loopback_list, &pkts); - spin_unlock_bh(&vsock->loopback_list_lock); - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_recv_pkt(pkt); - } -out: - mutex_unlock(&vsock->rx_lock); -} - static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, struct virtio_vsock_pkt *pkt) { @@ -370,59 +343,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) return val < virtqueue_get_vring_size(vq); } -static void virtio_transport_rx_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, rx_work); - struct virtqueue *vq; - - vq = vsock->vqs[VSOCK_VQ_RX]; - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - do { - virtqueue_disable_cb(vq); - for (;;) { - struct virtio_vsock_pkt *pkt; - unsigned int len; - - if (!virtio_transport_more_replies(vsock)) { - /* Stop rx until the device processes already - * pending replies. Leave rx virtqueue - * callbacks disabled. - */ - goto out; - } - - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { - break; - } - - vsock->rx_buf_nr--; - - /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); - continue; - } - - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(pkt); - } - } while (!virtqueue_enable_cb(vq)); - -out: - if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); -} - /* event_lock must be held */ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, struct virtio_vsock_event *event) @@ -542,6 +462,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) static struct virtio_transport virtio_transport = { .transport = { + .module = THIS_MODULE, + .get_local_cid = virtio_transport_get_local_cid, .init = virtio_transport_do_socket_init, @@ -574,18 +496,92 @@ static struct virtio_transport virtio_transport = { .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, + .notify_buffer_size = virtio_transport_notify_buffer_size, }, .send_pkt = virtio_transport_send_pkt, }; +static void virtio_transport_loopback_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, loopback_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->loopback_list_lock); + list_splice_init(&vsock->loopback_list, &pkts); + spin_unlock_bh(&vsock->loopback_list_lock); + + mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_recv_pkt(&virtio_transport, pkt); + } +out: + mutex_unlock(&vsock->rx_lock); +} + +static void virtio_transport_rx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, rx_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + + do { + virtqueue_disable_cb(vq); + for (;;) { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { + /* Stop rx until the device processes already + * pending replies. Leave rx virtqueue + * callbacks disabled. + */ + goto out; + } + + pkt = virtqueue_get_buf(vq, &len); + if (!pkt) { + break; + } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ + if (unlikely(len < sizeof(pkt->hdr) || + len > sizeof(pkt->hdr) + pkt->len)) { + virtio_transport_free_pkt(pkt); + continue; + } + + pkt->len = len - sizeof(pkt->hdr); + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&virtio_transport, pkt); + } + } while (!virtqueue_enable_cb(vq)); + +out: + if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); +} + static int virtio_vsock_probe(struct virtio_device *vdev) { vq_callback_t *callbacks[] = { @@ -776,7 +772,8 @@ static int __init virtio_vsock_init(void) if (!virtio_vsock_workqueue) return -ENOMEM; - ret = vsock_core_init(&virtio_transport.transport); + ret = vsock_core_register(&virtio_transport.transport, + VSOCK_TRANSPORT_F_G2H); if (ret) goto out_wq; @@ -787,7 +784,7 @@ static int __init virtio_vsock_init(void) return 0; out_vci: - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); out_wq: destroy_workqueue(virtio_vsock_workqueue); return ret; @@ -796,7 +793,7 @@ out_wq: static void __exit virtio_vsock_exit(void) { unregister_virtio_driver(&virtio_vsock_driver); - vsock_core_exit(); + vsock_core_unregister(&virtio_transport.transport); destroy_workqueue(virtio_vsock_workqueue); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index fb2060dffb0a..e5ea29c6bca7 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -29,9 +29,10 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 -static const struct virtio_transport *virtio_transport_get_ops(void) +static const struct virtio_transport * +virtio_transport_get_ops(struct vsock_sock *vsk) { - const struct vsock_transport *t = vsock_core_get_transport(); + const struct vsock_transport *t = vsock_core_get_transport(vsk); return container_of(t, struct virtio_transport, transport); } @@ -168,7 +169,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; - src_cid = vm_sockets_get_local_cid(); + src_cid = virtio_transport_get_ops(vsk)->transport.get_local_cid(); src_port = vsk->local_addr.svm_port; if (!info->remote_cid) { dst_cid = vsk->remote_addr.svm_cid; @@ -201,7 +202,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, pkt); - return virtio_transport_get_ops()->send_pkt(pkt); + return virtio_transport_get_ops(vsk)->send_pkt(pkt); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, @@ -268,6 +269,55 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, } static ssize_t +virtio_transport_stream_do_peek(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0, off; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + + list_for_each_entry(pkt, &vvs->rx_queue, list) { + off = pkt->off; + + if (total == len) + break; + + while (total < len && off < pkt->len) { + bytes = len - total; + if (bytes > pkt->len - off) + bytes = pkt->len - off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + off += bytes; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; + +out: + if (total) + err = total; + return err; +} + +static ssize_t virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) @@ -339,9 +389,9 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, size_t len, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); + return virtio_transport_stream_do_peek(vsk, msg, len); + else + return virtio_transport_stream_do_dequeue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); @@ -403,20 +453,16 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, vsk->trans = vvs; vvs->vsk = vsk; - if (psk) { + if (psk && psk->trans) { struct virtio_vsock_sock *ptrans = psk->trans; - vvs->buf_size = ptrans->buf_size; - vvs->buf_size_min = ptrans->buf_size_min; - vvs->buf_size_max = ptrans->buf_size_max; vvs->peer_buf_alloc = ptrans->peer_buf_alloc; - } else { - vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; - vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; - vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; } - vvs->buf_alloc = vvs->buf_size; + if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE) + vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE; + + vvs->buf_alloc = vsk->buffer_size; spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); @@ -426,71 +472,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); - -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size_min; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); - -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +/* sk_lock held by the caller */ +void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) { struct virtio_vsock_sock *vvs = vsk->trans; - return vvs->buf_size_max; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE) + *val = VIRTIO_VSOCK_MAX_BUF_SIZE; -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size_min) - vvs->buf_size_min = val; - if (val > vvs->buf_size_max) - vvs->buf_size_max = val; - vvs->buf_size = val; - vvs->buf_alloc = val; + vvs->buf_alloc = *val; virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, NULL); } -EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); - -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val > vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_min = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); - -void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_vsock_sock *vvs = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < vvs->buf_size) - vvs->buf_size = val; - vvs->buf_size_max = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); +EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, @@ -582,9 +577,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - - return vvs->buf_size; + return vsk->buffer_size; } EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); @@ -696,9 +689,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk, /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. */ -static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +static int virtio_transport_reset_no_sock(const struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { - const struct virtio_transport *t; struct virtio_vsock_pkt *reply; struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, @@ -718,7 +711,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) if (!reply) return -ENOMEM; - t = virtio_transport_get_ops(); if (!t) { virtio_transport_free_pkt(reply); return -ENOTCONN; @@ -994,13 +986,39 @@ virtio_transport_send_response(struct vsock_sock *vsk, return virtio_transport_send_pkt_info(vsk, &info); } +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* Listener sockets are not associated with any transport, so we are + * not able to take the state to see if there is space available in the + * remote peer, but since they are only used to receive requests, we + * can assume that there is always space available in the other peer. + */ + if (!vvs) + return true; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, + struct virtio_transport *t) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; + int ret; if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { virtio_transport_reset(vsk, pkt); @@ -1012,14 +1030,13 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return -ENOMEM; } - child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + child = vsock_create_connected(sk); if (!child) { virtio_transport_reset(vsk, pkt); return -ENOMEM; } - sk->sk_ack_backlog++; + sk_acceptq_added(sk); lock_sock_nested(child, SINGLE_DEPTH_NESTING); @@ -1031,6 +1048,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + ret = vsock_assign_transport(vchild, vsk); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (ret || vchild->transport != &t->transport) { + release_sock(child); + virtio_transport_reset(vsk, pkt); + sock_put(child); + return ret; + } + + if (virtio_transport_space_update(child, pkt)) + child->sk_write_space(child); + vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); virtio_transport_send_response(vchild, pkt); @@ -1041,26 +1072,11 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) return 0; } -static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_vsock_sock *vvs = vsk->trans; - bool space_available; - - /* buf_alloc and fwd_cnt is always included in the hdr */ - spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); - space_available = virtio_transport_has_space(vsk); - spin_unlock_bh(&vvs->tx_lock); - return space_available; -} - /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt) { struct sockaddr_vm src, dst; struct vsock_sock *vsk; @@ -1082,7 +1098,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) le32_to_cpu(pkt->hdr.fwd_cnt)); if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1093,7 +1109,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(pkt); + (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } } @@ -1112,7 +1128,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt); + virtio_transport_recv_listen(sk, pkt, t); virtio_transport_free_pkt(pkt); break; case TCP_SYN_SENT: @@ -1130,6 +1146,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) virtio_transport_free_pkt(pkt); break; } + release_sock(sk); /* Release refcnt obtained when we fetched this socket out of the diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8c9c4ed90fa7..644d32e43d23 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto); static u16 vmci_transport_new_proto_supported_versions(void); static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto, bool old_pkt_proto); +static bool vmci_check_transport(struct vsock_sock *vsk); struct vmci_transport_recv_pkt_info { struct work_struct work; @@ -74,15 +75,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; static int PROTOCOL_OVERRIDE = -1; -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144 -#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144 - -/* The default peer timeout indicates how long we will wait for a peer response - * to a control message. - */ -#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) - /* Helper function to convert from a VMCI error code to a VSock error code. */ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) @@ -1013,8 +1005,7 @@ static int vmci_transport_recv_listen(struct sock *sk, return -ECONNREFUSED; } - pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); + pending = vsock_create_connected(sk); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; @@ -1027,14 +1018,24 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context, pkt->src_port); + err = vsock_assign_transport(vpending, vsock_sk(sk)); + /* Transport assigned (looking at remote_addr) must be the same + * where we received the request. + */ + if (err || !vmci_check_transport(vpending)) { + vmci_transport_send_reset(sk, pkt); + sock_put(pending); + return err; + } + /* If the proposed size fits within our min/max, accept it. Otherwise * propose our own size. */ - if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size && - pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) { + if (pkt->u.size >= vpending->buffer_min_size && + pkt->u.size <= vpending->buffer_max_size) { qp_size = pkt->u.size; } else { - qp_size = vmci_trans(vpending)->queue_pair_size; + qp_size = vpending->buffer_size; } /* Figure out if we are using old or new requests based on the @@ -1098,12 +1099,12 @@ static int vmci_transport_recv_listen(struct sock *sk, } vsock_add_pending(sk, pending); - sk->sk_ack_backlog++; + sk_acceptq_added(sk); pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; - vmci_trans(vpending)->queue_pair_size = qp_size; + vpending->buffer_size = qp_size; vmci_trans(vpending)->notify_ops->process_request(pending); @@ -1397,8 +1398,8 @@ static int vmci_transport_recv_connecting_client_negotiate( vsk->ignore_connecting_rst = false; /* Verify that we're OK with the proposed queue pair size */ - if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size || - pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) { + if (pkt->u.size < vsk->buffer_min_size || + pkt->u.size > vsk->buffer_max_size) { err = -EINVAL; goto destroy; } @@ -1503,8 +1504,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk, vsk->sent_request = false; vsk->ignore_connecting_rst = true; - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) err = vmci_transport_error_to_vsock_error(err); else @@ -1588,21 +1588,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, INIT_LIST_HEAD(&vmci_trans(vsk)->elem); vmci_trans(vsk)->sk = &vsk->sk; spin_lock_init(&vmci_trans(vsk)->lock); - if (psk) { - vmci_trans(vsk)->queue_pair_size = - vmci_trans(psk)->queue_pair_size; - vmci_trans(vsk)->queue_pair_min_size = - vmci_trans(psk)->queue_pair_min_size; - vmci_trans(vsk)->queue_pair_max_size = - vmci_trans(psk)->queue_pair_max_size; - } else { - vmci_trans(vsk)->queue_pair_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE; - vmci_trans(vsk)->queue_pair_min_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN; - vmci_trans(vsk)->queue_pair_max_size = - VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX; - } return 0; } @@ -1818,8 +1803,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) if (vmci_transport_old_proto_override(&old_pkt_proto) && old_pkt_proto) { - err = vmci_transport_send_conn_request( - sk, vmci_trans(vsk)->queue_pair_size); + err = vmci_transport_send_conn_request(sk, vsk->buffer_size); if (err < 0) { sk->sk_state = TCP_CLOSE; return err; @@ -1827,8 +1811,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) } else { int supported_proto_versions = vmci_transport_new_proto_supported_versions(); - err = vmci_transport_send_conn_request2( - sk, vmci_trans(vsk)->queue_pair_size, + err = vmci_transport_send_conn_request2(sk, vsk->buffer_size, supported_proto_versions); if (err < 0) { sk->sk_state = TCP_CLOSE; @@ -1881,46 +1864,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk) return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle); } -static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_size; -} - -static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_min_size; -} - -static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - return vmci_trans(vsk)->queue_pair_max_size; -} - -static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_min_size) - vmci_trans(vsk)->queue_pair_min_size = val; - if (val > vmci_trans(vsk)->queue_pair_max_size) - vmci_trans(vsk)->queue_pair_max_size = val; - vmci_trans(vsk)->queue_pair_size = val; -} - -static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val > vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_min_size = val; -} - -static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk, - u64 val) -{ - if (val < vmci_trans(vsk)->queue_pair_size) - vmci_trans(vsk)->queue_pair_size = val; - vmci_trans(vsk)->queue_pair_max_size = val; -} - static int vmci_transport_notify_poll_in( struct vsock_sock *vsk, size_t target, @@ -2076,7 +2019,8 @@ static u32 vmci_transport_get_local_cid(void) return vmci_get_context_id(); } -static const struct vsock_transport vmci_transport = { +static struct vsock_transport vmci_transport = { + .module = THIS_MODULE, .init = vmci_transport_socket_init, .destruct = vmci_transport_destruct, .release = vmci_transport_release, @@ -2103,15 +2047,26 @@ static const struct vsock_transport vmci_transport = { .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue, .shutdown = vmci_transport_shutdown, - .set_buffer_size = vmci_transport_set_buffer_size, - .set_min_buffer_size = vmci_transport_set_min_buffer_size, - .set_max_buffer_size = vmci_transport_set_max_buffer_size, - .get_buffer_size = vmci_transport_get_buffer_size, - .get_min_buffer_size = vmci_transport_get_min_buffer_size, - .get_max_buffer_size = vmci_transport_get_max_buffer_size, .get_local_cid = vmci_transport_get_local_cid, }; +static bool vmci_check_transport(struct vsock_sock *vsk) +{ + return vsk->transport == &vmci_transport; +} + +void vmci_vsock_transport_cb(bool is_host) +{ + int features; + + if (is_host) + features = VSOCK_TRANSPORT_F_H2G; + else + features = VSOCK_TRANSPORT_F_G2H; + + vsock_core_register(&vmci_transport, features); +} + static int __init vmci_transport_init(void) { int err; @@ -2128,7 +2083,6 @@ static int __init vmci_transport_init(void) pr_err("Unable to create datagram handle. (%d)\n", err); return vmci_transport_error_to_vsock_error(err); } - err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED, vmci_transport_qp_resumed_cb, NULL, &vmci_transport_qp_resumed_sub_id); @@ -2139,12 +2093,21 @@ static int __init vmci_transport_init(void) goto err_destroy_stream_handle; } - err = vsock_core_init(&vmci_transport); + /* Register only with dgram feature, other features (H2G, G2H) will be + * registered when the first host or guest becomes active. + */ + err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM); if (err < 0) goto err_unsubscribe; + err = vmci_register_vsock_callback(vmci_vsock_transport_cb); + if (err < 0) + goto err_unregister; + return 0; +err_unregister: + vsock_core_unregister(&vmci_transport); err_unsubscribe: vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); err_destroy_stream_handle: @@ -2170,7 +2133,8 @@ static void __exit vmci_transport_exit(void) vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; } - vsock_core_exit(); + vmci_register_vsock_callback(NULL); + vsock_core_unregister(&vmci_transport); } module_exit(vmci_transport_exit); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index 1ca1e8640b31..b7b072194282 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -108,9 +108,6 @@ struct vmci_transport { struct vmci_qp *qpair; u64 produce_size; u64 consume_size; - u64 queue_pair_size; - u64 queue_pair_min_size; - u64 queue_pair_max_size; u32 detach_sub_id; union vmci_transport_notify notify; const struct vmci_transport_notify_ops *notify_ops; diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h index 7843f08d4290..a1aa5a998c0e 100644 --- a/net/vmw_vsock/vmci_transport_notify.h +++ b/net/vmw_vsock/vmci_transport_notify.h @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> -#include <linux/vm_sockets.h> #include "vmci_transport.h" diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b72286922f7..da5262b2298b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -624,6 +624,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), + [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), }; /* policy for the key attributes */ @@ -3940,6 +3941,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) key.type != NL80211_KEYTYPE_GROUP) return -EINVAL; + if (key.type == NL80211_KEYTYPE_GROUP && + info->attrs[NL80211_ATTR_VLAN_ID]) + key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (!rdev->ops->add_key) return -EOPNOTSUPP; @@ -5711,6 +5716,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_STA_AID]) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); @@ -5856,6 +5864,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_VLAN_ID]) + params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]); + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { params.support_p2p_ps = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); @@ -8265,10 +8276,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, /* leave request id zero for legacy request * or if driver does not support multi-scheduled scan */ - if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { - while (!sched_scan_req->reqid) - sched_scan_req->reqid = cfg80211_assign_cookie(rdev); - } + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) + sched_scan_req->reqid = cfg80211_assign_cookie(rdev); err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) diff --git a/net/wireless/reg.h b/net/wireless/reg.h index dc8f689bd469..f9e83031a40a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, u8 country_ie_len); /** - * regulatory_hint_disconnect - informs all devices have been disconneted + * regulatory_hint_disconnect - informs all devices have been disconnected * * Regulotory rules can be enhanced further upon scanning and upon * connection to an AP. These rules become stale if we disconnect diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6aee9f5e8e71..c34f7d077604 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags, /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); - sk->sk_ack_backlog--; + sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: @@ -1062,7 +1062,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; - sk->sk_ack_backlog++; + sk_acceptq_added(sk); x25_insert_socket(make); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9044073fbf22..956793893c9d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -196,7 +196,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len; @@ -212,7 +212,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } -void xsk_flush(struct xdp_sock *xs) +static void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); xs->sk.sk_data_ready(&xs->sk); @@ -264,6 +264,35 @@ out_unlock: return err; } +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del_clearprev(&xs->flush_node); + } +} + void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { xskq_produce_flush_addr_n(umem->cq, nb_entries); @@ -418,10 +447,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return __xsk_sendmsg(sk); } -static unsigned int xsk_poll(struct file *file, struct socket *sock, +static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; @@ -443,9 +472,9 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, } if (xs->rx && !xskq_empty_desc(xs->rx)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && !xskq_full_desc(xs->tx)) - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; return mask; } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 3981bc0d9e6c..6921a18201a0 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -3,13 +3,13 @@ # XFRM configuration # config XFRM - bool - depends on INET - select GRO_CELLS - select SKB_EXTENSIONS + bool + depends on INET + select GRO_CELLS + select SKB_EXTENSIONS config XFRM_OFFLOAD - bool + bool config XFRM_ALGO tristate diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 0f5131bc3342..7ac1542feaf8 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -732,30 +732,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) -{ - struct xfrm_if *xi; - LIST_HEAD(list); - - xi = rtnl_dereference(xfrmn->xfrmi[0]); - if (!xi) - return; - - unregister_netdevice_queue(xi->dev, &list); - unregister_netdevice_many(&list); -} - -static void __net_exit xfrmi_exit_net(struct net *net) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - rtnl_lock(); - xfrmi_destroy_interfaces(xfrmn); - rtnl_unlock(); -} - static struct pernet_operations xfrmi_net_ops = { - .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; |