summaryrefslogtreecommitdiffstats
path: root/net/smc/smc_ib.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 14:20:58 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 14:20:58 +0100
commitfc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4 (patch)
tree93b16bc48fdc3be4a1adccbf4c7de92a5e8440e1 /net/smc/smc_ib.c
parentMerge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert... (diff)
parentRevert "net: avoid double accounting for pure zerocopy skbs" (diff)
downloadlinux-fc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4.tar.xz
linux-fc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4.zip
Merge tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Remove socket skb caches - Add a SO_RESERVE_MEM socket op to forward allocate buffer space and avoid memory accounting overhead on each message sent - Introduce managed neighbor entries - added by control plane and resolved by the kernel for use in acceleration paths (BPF / XDP right now, HW offload users will benefit as well) - Make neighbor eviction on link down controllable by userspace to work around WiFi networks with bad roaming implementations - vrf: Rework interaction with netfilter/conntrack - fq_codel: implement L4S style ce_threshold_ect1 marking - sch: Eliminate unnecessary RCU waits in mini_qdisc_pair_swap() BPF: - Add support for new btf kind BTF_KIND_TAG, arbitrary type tagging as implemented in LLVM14 - Introduce bpf_get_branch_snapshot() to capture Last Branch Records - Implement variadic trace_printk helper - Add a new Bloomfilter map type - Track <8-byte scalar spill and refill - Access hw timestamp through BPF's __sk_buff - Disallow unprivileged BPF by default - Document BPF licensing Netfilter: - Introduce egress hook for looking at raw outgoing packets - Allow matching on and modifying inner headers / payload data - Add NFT_META_IFTYPE to match on the interface type either from ingress or egress Protocols: - Multi-Path TCP: - increase default max additional subflows to 2 - rework forward memory allocation - add getsockopts: MPTCP_INFO, MPTCP_TCPINFO, MPTCP_SUBFLOW_ADDRS - MCTP flow support allowing lower layer drivers to configure msg muxing as needed - Automatic Multicast Tunneling (AMT) driver based on RFC7450 - HSR support the redbox supervision frames (IEC-62439-3:2018) - Support for the ip6ip6 encapsulation of IOAM - Netlink interface for CAN-FD's Transmitter Delay Compensation - Support SMC-Rv2 eliminating the current same-subnet restriction, by exploiting the UDP encapsulation feature of RoCE adapters - TLS: add SM4 GCM/CCM crypto support - Bluetooth: initial support for link quality and audio/codec offload Driver APIs: - Add a batched interface for RX buffer allocation in AF_XDP buffer pool - ethtool: Add ability to control transceiver modules' power mode - phy: Introduce supported interfaces bitmap to express MAC capabilities and simplify PHY code - Drop rtnl_lock from DSA .port_fdb_{add,del} callbacks New drivers: - WiFi driver for Realtek 8852AE 802.11ax devices (rtw89) - Ethernet driver for ASIX AX88796C SPI device (x88796c) Drivers: - Broadcom PHYs - support 72165, 7712 16nm PHYs - support IDDQ-SR for additional power savings - PHY support for QCA8081, QCA9561 PHYs - NXP DPAA2: support for IRQ coalescing - NXP Ethernet (enetc): support for software TCP segmentation - Renesas Ethernet (ravb) - support DMAC and EMAC blocks of Gigabit-capable IP found on RZ/G2L SoC - Intel 100G Ethernet - support for eswitch offload of TC/OvS flow API, including offload of GRE, VxLAN, Geneve tunneling - support application device queues - ability to assign Rx and Tx queues to application threads - PTP and PPS (pulse-per-second) extensions - Broadcom Ethernet (bnxt) - devlink health reporting and device reload extensions - Mellanox Ethernet (mlx5) - offload macvlan interfaces - support HW offload of TC rules involving OVS internal ports - support HW-GRO and header/data split - support application device queues - Marvell OcteonTx2: - add XDP support for PF - add PTP support for VF - Qualcomm Ethernet switch (qca8k): support for QCA8328 - Realtek Ethernet DSA switch (rtl8366rb) - support bridge offload - support STP, fast aging, disabling address learning - support for Realtek RTL8365MB-VC, a 4+1 port 10M/100M/1GE switch - Mellanox Ethernet/IB switch (mlxsw) - multi-level qdisc hierarchy offload (e.g. RED, prio and shaping) - offload root TBF qdisc as port shaper - support multiple routing interface MAC address prefixes - support for IP-in-IP with IPv6 underlay - MediaTek WiFi (mt76) - mt7921 - ASPM, 6GHz, SDIO and testmode support - mt7915 - LED and TWT support - Qualcomm WiFi (ath11k) - include channel rx and tx time in survey dump statistics - support for 80P80 and 160 MHz bandwidths - support channel 2 in 6 GHz band - spectral scan support for QCN9074 - support for rx decapsulation offload (data frames in 802.3 format) - Qualcomm phone SoC WiFi (wcn36xx) - enable Idle Mode Power Save (IMPS) to reduce power consumption during idle - Bluetooth driver support for MediaTek MT7922 and MT7921 - Enable support for AOSP Bluetooth extension in Qualcomm WCN399x and Realtek 8822C/8852A - Microsoft vNIC driver (mana) - support hibernation and kexec - Google vNIC driver (gve) - support for jumbo frames - implement Rx page reuse Refactor: - Make all writes to netdev->dev_addr go thru helpers, so that we can add this address to the address rbtree and handle the updates - Various TCP cleanups and optimizations including improvements to CPU cache use - Simplify the gnet_stats, Qdisc stats' handling and remove qdisc->running sequence counter - Driver changes and API updates to address devlink locking deficiencies" * tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2122 commits) Revert "net: avoid double accounting for pure zerocopy skbs" selftests: net: add arp_ndisc_evict_nocarrier net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter net: arp: introduce arp_evict_nocarrier sysctl parameter libbpf: Deprecate AF_XDP support kbuild: Unify options for BTF generation for vmlinux and modules selftests/bpf: Add a testcase for 64-bit bounds propagation issue. bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit. bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off. net: vmxnet3: remove multiple false checks in vmxnet3_ethtool.c net: avoid double accounting for pure zerocopy skbs tcp: rename sk_wmem_free_skb netdevsim: fix uninit value in nsim_drv_configure_vfs() selftests/bpf: Fix also no-alu32 strobemeta selftest bpf: Add missing map_delete_elem method to bloom filter map selftests/bpf: Add bloom map success test for userspace calls bpf: Add alignment padding for "map_extra" + consolidate holes bpf: Bloom filter map naming fixups selftests/bpf: Add test cases for struct_ops prog bpf: Add dummy BPF STRUCT_OPS for test purpose ...
Diffstat (limited to 'net/smc/smc_ib.c')
-rw-r--r--net/smc/smc_ib.c160
1 files changed, 146 insertions, 14 deletions
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a8845343d183..d93055ec17ae 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -17,6 +17,7 @@
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
+#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
@@ -62,16 +63,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
struct ib_qp_attr qp_attr;
+ u8 hop_lim = 1;
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.qp_state = IB_QPS_RTR;
qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
- rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
+ if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+ hop_lim = IPV6_DEFAULT_HOPLIMIT;
+ rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
- memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
- sizeof(lnk->peer_mac));
+ if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+ memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
+ sizeof(lnk->lgr->nexthop_mac));
+ else
+ memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
qp_attr.dest_qp_num = lnk->peer_qpn;
qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
@@ -183,9 +191,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+ u8 nexthop_mac[], u8 *uses_gateway)
+{
+ struct neighbour *neigh = NULL;
+ struct rtable *rt = NULL;
+ struct flowi4 fl4 = {
+ .saddr = saddr,
+ .daddr = daddr
+ };
+
+ if (daddr == cpu_to_be32(INADDR_NONE))
+ goto out;
+ rt = ip_route_output_flow(&init_net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out;
+ if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
+ goto out;
+ neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
+ if (neigh) {
+ memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
+ *uses_gateway = rt->rt_uses_gateway;
+ return 0;
+ }
+out:
+ return -ENOENT;
+}
+
+static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
+ const struct ib_gid_attr *attr,
+ u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2)
+{
+ if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ return 0;
+ }
+ if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+ smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
+ struct in_device *in_dev = __in_dev_get_rcu(ndev);
+ const struct in_ifaddr *ifa;
+ bool subnet_match = false;
+
+ if (!in_dev)
+ goto out;
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (!inet_ifa_match(smcrv2->saddr, ifa))
+ continue;
+ subnet_match = true;
+ break;
+ }
+ if (!subnet_match)
+ goto out;
+ if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
+ smcrv2->daddr,
+ smcrv2->nexthop_mac,
+ &smcrv2->uses_gateway))
+ goto out;
+
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ return 0;
+ }
+out:
+ return -ENODEV;
+}
+
/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
- unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2)
{
const struct ib_gid_attr *attr;
const struct net_device *ndev;
@@ -201,15 +281,13 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
if (!IS_ERR(ndev) &&
((!vlan_id && !is_vlan_dev(ndev)) ||
(vlan_id && is_vlan_dev(ndev) &&
- vlan_dev_vlan_id(ndev) == vlan_id)) &&
- attr->gid_type == IB_GID_TYPE_ROCE) {
- rcu_read_unlock();
- if (gid)
- memcpy(gid, &attr->gid, SMC_GID_SIZE);
- if (sgid_index)
- *sgid_index = attr->index;
- rdma_put_gid_attr(attr);
- return 0;
+ vlan_dev_vlan_id(ndev) == vlan_id))) {
+ if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
+ sgid_index, smcrv2)) {
+ rcu_read_unlock();
+ rdma_put_gid_attr(attr);
+ return 0;
+ }
}
rcu_read_unlock();
rdma_put_gid_attr(attr);
@@ -217,6 +295,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
return -ENODEV;
}
+/* check if gid is still defined on smcibdev */
+static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
+ struct smc_ib_device *smcibdev, u8 ibport)
+{
+ const struct ib_gid_attr *attr;
+ bool rc = false;
+ int i;
+
+ for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+ if (IS_ERR(attr))
+ continue;
+
+ rcu_read_lock();
+ if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
+ (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+ !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
+ & IPV6_ADDR_LINKLOCAL)))
+ if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
+ rc = true;
+ rcu_read_unlock();
+ rdma_put_gid_attr(attr);
+ }
+ return rc;
+}
+
+/* check all links if the gid is still defined on smcibdev */
+static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr;
+ int i;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN))
+ continue; /* lgr is not affected */
+ if (list_empty(&lgr->list))
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+ lgr->lnk[i].smcibdev != smcibdev)
+ continue;
+ if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
+ lgr->smc_version == SMC_V2,
+ smcibdev, ibport))
+ smcr_port_err(smcibdev, ibport);
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
int rc;
@@ -255,6 +385,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
} else {
clear_bit(port_idx, smcibdev->ports_going_away);
smcr_port_add(smcibdev, port_idx + 1);
+ smc_ib_gid_check(smcibdev, port_idx + 1);
}
}
}
@@ -523,6 +654,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
+ int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
struct ib_qp_init_attr qp_attr = {
.event_handler = smc_ib_qp_event_handler,
.qp_context = lnk,
@@ -536,7 +668,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.max_send_wr = SMC_WR_BUF_CNT * 3,
.max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE,
- .max_recv_sge = 1,
+ .max_recv_sge = sges_per_buf,
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_RC,