Diffstat (limited to 'net')
157 files changed, 8440 insertions, 2522 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 4cdf8416869d..55275ef9a31a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -67,7 +67,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; - array = kzalloc(size, GFP_KERNEL); + array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a0367b37512d..0c21d1fec852 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -372,8 +372,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: case SIOCSMIIREG: case SIOCGHWTSTAMP: - if (netif_device_present(real_dev) && ops->ndo_do_ioctl) - err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); + if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) + err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } @@ -814,7 +814,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, - .ndo_do_ioctl = vlan_dev_ioctl, + .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) diff --git a/net/Kconfig b/net/Kconfig index c7392c449b25..fb13460c6dab 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -363,6 +363,7 @@ source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" source "net/strparser/Kconfig" +source "net/mctp/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9ca9572188fe..fbfeb8a0bb37 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ +obj-$(CONFIG_MCTP) += mctp/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 8ade5a4ceaf5..bf5736c1d458 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -666,7 +666,7 @@ static int atif_ioctl(int cmd, void __user *arg) struct rtentry rtdef; int add_route; - if (copy_from_user(&atreq, arg, sizeof(atreq))) + if (get_user_ifreq(&atreq, NULL, arg)) return -EFAULT; dev = __dev_get_by_name(&init_net, atreq.ifr_name); @@ -865,7 +865,7 @@ static int atif_ioctl(int cmd, void __user *arg) return 0; } - return copy_to_user(arg, &atreq, sizeof(atreq)) ? 
-EFAULT : 0; + return put_user_ifreq(&atreq, arg); } static int atrtr_ioctl_addrt(struct rtentry *rt) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1cc75c811e24..b488e2779718 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,6 +15,7 @@ #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> +#include <net/xdp.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> @@ -687,6 +688,64 @@ out: return ret; } +static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) +{ + unsigned int ingress_ifindex, rx_queue_index; + struct netdev_rx_queue *rxqueue; + struct net_device *device; + + if (!xdp_md) + return 0; + + if (xdp_md->egress_ifindex != 0) + return -EINVAL; + + ingress_ifindex = xdp_md->ingress_ifindex; + rx_queue_index = xdp_md->rx_queue_index; + + if (!ingress_ifindex && rx_queue_index) + return -EINVAL; + + if (ingress_ifindex) { + device = dev_get_by_index(current->nsproxy->net_ns, + ingress_ifindex); + if (!device) + return -ENODEV; + + if (rx_queue_index >= device->real_num_rx_queues) + goto free_dev; + + rxqueue = __netif_get_rx_queue(device, rx_queue_index); + + if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) + goto free_dev; + + xdp->rxq = &rxqueue->xdp_rxq; + /* The device is now tracked in the xdp->rxq for later + * dev_put() + */ + } + + xdp->data = xdp->data_meta + xdp_md->data; + return 0; + +free_dev: + dev_put(device); + return -EINVAL; +} + +static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) +{ + if (!xdp_md) + return; + + xdp_md->data = xdp->data - xdp->data_meta; + xdp_md->data_end = xdp->data_end - xdp->data_meta; + + if (xdp_md->ingress_ifindex) + dev_put(xdp->rxq->dev); +} + int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -697,38 +756,74 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + struct xdp_md *ctx; u32 max_data_sz; void *data; - int ret; + int ret = -EINVAL; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + /* There can't be user provided data before the meta data */ + if (ctx->data_meta || ctx->data_end != size || + ctx->data > ctx->data_end || + unlikely(xdp_metalen_invalid(ctx->data))) + goto free_ctx; + /* Meta data is allocated from the headroom */ + headroom -= ctx->data; + } /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto free_ctx; + } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); + ret = xdp_convert_md_to_buff(ctx, &xdp); + if (ret) + goto free_data; + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + /* We convert the xdp_buff back to an xdp_md before checking the return + * code so the reference count of any held netdevice will be decremented + * even if the test run failed. 
+ */ + xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; - if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) - size = xdp.data_end - xdp.data; - ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); + + if (xdp.data_meta != data + headroom || + xdp.data_end != xdp.data_meta + size) + size = xdp.data_end - xdp.data_meta; + + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval, + duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct xdp_md)); + out: bpf_prog_change_xdp(prog, NULL); +free_data: kfree(data); +free_ctx: + kfree(ctx); return ret; } diff --git a/net/bridge/br.c b/net/bridge/br.c index ef743f94254d..8fb5dca5f8e0 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -214,17 +214,22 @@ static struct notifier_block br_switchdev_notifier = { int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { + int err = 0; + switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + err = br_multicast_toggle_vlan_snooping(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } - return 0; + return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) @@ -232,6 +237,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -352,7 +359,7 @@ static int __init br_init(void) if (err) goto err_out5; - brioctl_set(br_ioctl_deviceless_stub); + brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e8b626cc6bfd..8d6bab244c4a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,11 +27,14 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); + struct net_bridge_mcast *brmctx = &br->multicast_ctx; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; + struct net_bridge_vlan *vlan; const unsigned char *dest; u16 vid = 0; @@ -53,7 +56,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, + &state, &vlan)) goto out; if (IS_ENABLED(CONFIG_INET) && @@ -82,15 +86,15 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_flood(br, skb, BR_PKT_MULTICAST, false, true); goto out; } - if (br_multicast_rcv(br, NULL, skb, vid)) { + if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) { kfree_skb(skb); goto out; } - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) - br_multicast_flood(mdst, skb, false, true); + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) + br_multicast_flood(mdst, skb, 
brmctx, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { @@ -450,7 +454,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, .ndo_change_mtu = br_change_mtu, - .ndo_do_ioctl = br_dev_ioctl, + .ndo_siocdevprivate = br_dev_siocdevprivate, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = br_netpoll_setup, .ndo_netpoll_cleanup = br_netpoll_cleanup, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index a16191dcaed1..4ff8c67ac88f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -732,11 +732,11 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } -static int br_fdb_replay_one(struct notifier_block *nb, +static int br_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, - struct net_device *dev, unsigned long action, - const void *ctx) + unsigned long action, const void *ctx) { + const struct net_bridge_port *p = READ_ONCE(fdb->dst); struct switchdev_notifier_fdb_info item; int err; @@ -745,25 +745,25 @@ static int br_fdb_replay_one(struct notifier_block *nb, item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); - item.info.dev = dev; + item.info.dev = item.is_local ? br->dev : p->dev; item.info.ctx = ctx; err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } -int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb) +int br_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, + struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; unsigned long action; int err = 0; - if (!netif_is_bridge_master(br_dev)) - return -EINVAL; + if (!nb) + return 0; - if (!netif_is_bridge_port(dev) && !netif_is_bridge_master(dev)) + if (!netif_is_bridge_master(br_dev)) return -EINVAL; br = netdev_priv(br_dev); @@ -776,14 +776,7 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { - const struct net_bridge_port *dst = READ_ONCE(fdb->dst); - struct net_device *dst_dev; - - dst_dev = dst ? dst->dev : br->dev; - if (dst_dev && dst_dev != dev) - continue; - - err = br_fdb_replay_one(nb, fdb, dst_dev, action, ctx); + err = br_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } @@ -792,7 +785,6 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, return err; } -EXPORT_SYMBOL_GPL(br_fdb_replay); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 07856362538f..ec646656dbf1 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -48,6 +48,8 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_set_network_header(skb, depth); } + br_switchdev_frame_set_offload_fwd_mark(skb); + dev_queue_xmit(skb); return 0; @@ -76,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to, struct net *net; int br_hook; + /* Mark the skb for forwarding offload early so that br_handle_vlan() + * can know whether to pop the VLAN header on egress or keep it. 
+ */ + nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) @@ -174,6 +181,8 @@ static struct net_bridge_port *maybe_deliver( if (!should_deliver(p, skb)) return prev; + nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); + if (!prev) goto out; @@ -267,20 +276,19 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; struct hlist_node *rp; - rp = br_multicast_get_first_rport_node(br, skb); + rp = br_multicast_get_first_rport_node(brmctx, skb); if (mdst) { p = rcu_dereference(mdst->ports); - if (br_multicast_should_handle_mode(br, mdst->addr.proto) && + if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) && br_multicast_is_star_g(&mdst->addr)) allow_mode_include = false; } else { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 6e4a32354a13..86f6d7e93ea8 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -643,10 +643,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err5; - err = nbp_switchdev_mark_set(p); - if (err) - goto err6; - dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -684,13 +680,13 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) - goto err7; + goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err7; + goto err6; } spin_lock_bh(&br->lock); @@ -713,13 +709,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, return 0; -err7: +err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); -err6: netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 1f506309efa8..8a0c0cc55cb4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -69,8 +69,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; + struct net_bridge_mcast *brmctx; + struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; @@ -78,9 +81,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + brmctx = &p->br->multicast_ctx; + pmctx = &p->multicast_ctx; state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, - &state)) + &state, &vlan)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -98,7 +103,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; - if (br_multicast_rcv(br, p, skb, vid)) + if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } @@ -128,11 +133,11 @@ int br_handle_frame_finish(struct 
net *net, struct sock *sk, struct sk_buff *skb switch (pkt_type) { case BR_PKT_MULTICAST: - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(br, skb)) { + br_multicast_is_router(brmctx, skb)) { local_rcv = true; br->dev->stats.multicast++; } @@ -162,7 +167,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false); else - br_multicast_flood(mdst, skb, local_rcv, false); + br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 2db800fc27ca..46a24c20e405 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -106,15 +106,32 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ -static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; + void __user *argp; int ret = -EOPNOTSUPP; - if (copy_from_user(args, rq->ifr_data, sizeof(args))) - return -EFAULT; + if (in_compat_syscall()) { + unsigned int cargs[4]; + + if (copy_from_user(cargs, data, sizeof(cargs))) + return -EFAULT; + + args[0] = cargs[0]; + args[1] = cargs[1]; + args[2] = cargs[2]; + args[3] = cargs[3]; + + argp = compat_ptr(args[1]); + } else { + if (copy_from_user(args, data, sizeof(args))) + return -EFAULT; + + argp = (void __user *)args[1]; + } switch (args[0]) { case BRCTL_ADD_IF: @@ -171,7 +188,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return -ENOMEM; get_port_ifindices(br, indices, num); - if (copy_to_user((void __user *)args[1], indices, num*sizeof(int))) + if (copy_to_user(argp, indices, num * sizeof(int))) num = -EFAULT; kfree(indices); return num; @@ -232,7 +249,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) rcu_read_unlock(); - if (copy_to_user((void __user *)args[1], &p, sizeof(p))) + if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; @@ -282,8 +299,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) } case BRCTL_GET_FDB_ENTRIES: - return get_fdb_entries(br, (void __user *)args[1], - args[2], args[3]); + return get_fdb_entries(br, argp, args[2], args[3]); } if (!ret) { @@ -320,7 +336,7 @@ static int old_deviceless(struct net *net, void __user *uarg) args[2] = get_bridge_ifindices(net, indices, args[2]); - ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int)) + ret = copy_to_user(uarg, indices, args[2]*sizeof(int)) ? 
-EFAULT : args[2]; kfree(indices); @@ -335,7 +351,7 @@ static int old_deviceless(struct net *net, void __user *uarg) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ)) + if (copy_from_user(buf, uarg, IFNAMSIZ)) return -EFAULT; buf[IFNAMSIZ-1] = 0; @@ -350,7 +366,8 @@ static int old_deviceless(struct net *net, void __user *uarg) return -EOPNOTSUPP; } -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg) { switch (cmd) { case SIOCGIFBR: @@ -374,24 +391,11 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uar return br_del_bridge(net, buf); } - } - return -EOPNOTSUPP; -} - -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct net_bridge *br = netdev_priv(dev); - - switch (cmd) { - case SIOCDEVPRIVATE: - return old_dev_ioctl(dev, rq, cmd); case SIOCBRADDIF: case SIOCBRDELIF: - return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); + return add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); } - - br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd); return -EOPNOTSUPP; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 17a720b4473f..73a8915b0148 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,29 +16,29 @@ #include "br_private.h" -static bool br_rports_have_mc_router(struct net_bridge *br) +static bool br_rports_have_mc_router(struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) - return !hlist_empty(&br->ip4_mc_router_list) || - !hlist_empty(&br->ip6_mc_router_list); + return !hlist_empty(&brmctx->ip4_mc_router_list) || + !hlist_empty(&brmctx->ip6_mc_router_list); #else - return !hlist_empty(&br->ip4_mc_router_list); + return !hlist_empty(&brmctx->ip4_mc_router_list); #endif } static bool br_ip4_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) { - *timer = br_timer_value(&port->ip4_mc_router_timer); - return !hlist_unhashed(&port->ip4_rlist); + *timer = br_timer_value(&port->multicast_ctx.ip4_mc_router_timer); + return !hlist_unhashed(&port->multicast_ctx.ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) - *timer = br_timer_value(&port->ip6_mc_router_timer); - return !hlist_unhashed(&port->ip6_rlist); + *timer = br_timer_value(&port->multicast_ctx.ip6_mc_router_timer); + return !hlist_unhashed(&port->multicast_ctx.ip6_rlist); #else *timer = 0; return false; @@ -54,10 +54,10 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct nlattr *nest, *port_nest; struct net_bridge_port *p; - if (!br->multicast_router) + if (!br->multicast_ctx.multicast_router) return 0; - if (!br_rports_have_mc_router(br)) + if (!br_rports_have_mc_router(&br->multicast_ctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); @@ -79,7 +79,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, - p->multicast_router) || + p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || @@ -240,7 +240,7 @@ static int __mdb_fill_info(struct sk_buff *skb, switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); + 
dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) @@ -250,7 +250,7 @@ static int __mdb_fill_info(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) @@ -483,7 +483,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); - if (pg->key.port->br->multicast_igmp_version == 2) + if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; @@ -492,7 +492,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); - if (pg->key.port->br->multicast_mld_version == 1) + if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; @@ -617,6 +617,9 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, ASSERT_RTNL(); + if (!nb) + return 0; + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; @@ -686,7 +689,6 @@ out_free_mdb: return err; } -EXPORT_SYMBOL_GPL(br_mdb_replay); static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, @@ -781,12 +783,12 @@ errout: static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, - int ifindex, u32 pid, + int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { + struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; - struct nlattr *nest; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) @@ -800,8 +802,18 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, if (!nest) goto cancel; - if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto end; + if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { + nla_nest_cancel(skb, port_nest); goto end; + } + if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { + nla_nest_cancel(skb, port_nest); + goto end; + } + nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -817,23 +829,28 @@ cancel: static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) - + nla_total_size(sizeof(__u32)); + + nla_total_size(sizeof(__u32)) + + nla_total_size(sizeof(u16)); } -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; + u16 vid; - ifindex = port ? port->dev->ifindex : 0; + ifindex = pmctx ? pmctx->port->dev->ifindex : 0; + vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? 
pmctx->vlan->vid : + 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, + NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; @@ -1004,14 +1021,47 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static struct net_bridge_mcast * +__br_mdb_choose_context(struct net_bridge *br, + const struct br_mdb_entry *entry, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mcast *brmctx = NULL; + struct net_bridge_vlan *v; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + brmctx = &br->multicast_ctx; + goto out; + } + + if (!entry->vid) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); + goto out; + } + + v = br_vlan_find(br_vlan_group(br), entry->vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); + goto out; + } + if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); + goto out; + } + brmctx = &v->br_mcast_ctx; +out: + return brmctx; +} + static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_mdb_entry *entry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; - struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mcast *brmctx; struct br_ip group, star_group; unsigned long now = jiffies; unsigned char flags = 0; @@ -1020,6 +1070,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + brmctx = __br_mdb_choose_context(br, entry, extack); + if (!brmctx) + return -EINVAL; + /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ @@ -1053,7 +1107,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EEXIST; } - br_multicast_host_join(mp, false); + br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; @@ -1084,14 +1138,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } rcu_assign_pointer(*pp, p); if (entry->state == MDB_TEMPORARY) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); /* if we are adding a new EXCLUDE port group (*,G) it needs to be also * added to all S,G entries for proper replication, if we are adding * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be * added to it for proper replication */ - if (br_multicast_should_handle_mode(br, group.proto)) { + if (br_multicast_should_handle_mode(brmctx, group.proto)) { switch (filter_mode) { case MCAST_EXCLUDE: br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d0434dc8c03b..470f1ec3b579 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -49,30 +49,30 @@ static const struct rhashtable_params br_sg_port_rht_params = { .automatic_shrinking = true, }; -static void br_multicast_start_querier(struct net_bridge *br, +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct 
bridge_mcast_own_query *query); -static void br_ip4_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); static void -br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); -static void br_ip6_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted); +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -80,6 +80,7 @@ __br_multicast_add_group(struct net_bridge *br, bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); +static void __br_multicast_stop(struct net_bridge_mcast *brmctx); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, @@ -140,12 +141,14 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, } #endif -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge *br = brmctx->br; struct br_ip ip; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) @@ -158,7 +161,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; @@ -171,7 +174,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; @@ -190,6 +193,62 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return br_mdb_ip_get_rcu(br, &ip); } +/* IMPORTANT: this function must be used only when the contexts cannot be + * passed down (e.g. timer) and must be used for read-only purposes because + * the vlan snooping option can change, so it can return any context + * (non-vlan or vlan). Its initial intended purpose is to read timer values + * from the *current* context based on the option. 
At worst that could lead + * to inconsistent timers when the contexts are changed, i.e. src timer + * which needs to re-arm with a specific delay taken from the old context + */ +static struct net_bridge_mcast_port * +br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) +{ + struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; + struct net_bridge_vlan *vlan; + + lockdep_assert_held_once(&pg->key.port->br->multicast_lock); + + /* if vlan snooping is disabled use the port's multicast context */ + if (!pg->key.addr.vid || + !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + goto out; + + /* locking is tricky here, due to different rules for multicast and + * vlans we need to take rcu to find the vlan and make sure it has + * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under + * multicast_lock which must be already held here, so the vlan's pmctx + * can safely be used on return + */ + rcu_read_lock(); + vlan = br_vlan_find(nbp_vlan_group(pg->key.port), pg->key.addr.vid); + if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + pmctx = &vlan->port_mcast_ctx; + else + pmctx = NULL; + rcu_read_unlock(); +out: + return pmctx; +} + +/* when snooping we need to check if the contexts should be used + * in the following order: + * - if pmctx is non-NULL (port), check if it should be used + * - if pmctx is NULL (bridge), check if brmctx should be used + */ +static bool +br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, + const struct net_bridge_mcast_port *pmctx) +{ + if (!netif_running(brmctx->br->dev)) + return false; + + if (pmctx) + return !br_multicast_port_ctx_state_disabled(pmctx); + else + return !br_multicast_ctx_vlan_disabled(brmctx); +} + static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) @@ -203,20 +262,23 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, p->eth_addr); } -static void __fwd_add_star_excl(struct net_bridge_port_group *pg, +static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; - struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; + struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; - if (br_sg_port_find(br, &sg_key)) + if (br_sg_port_find(brmctx->br, &sg_key)) return; - src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + src_pg = __br_multicast_add_group(brmctx, pmctx, + sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) @@ -256,6 +318,7 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; @@ -265,9 +328,13 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { @@ -284,7 +351,7 @@ void br_multicast_star_g_handle_mode(struct 
net_bridge_port_group *pg, __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: - __fwd_add_star_excl(pg, &sg_ip); + __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } @@ -377,7 +444,9 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; + struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; @@ -400,7 +469,12 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, if (br_sg_port_find(br, &sg_key)) continue; - src_pg = __br_multicast_add_group(br, pg->key.port, + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + continue; + brmctx = br_multicast_port_ctx_get_global(pmctx); + + src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); @@ -414,16 +488,23 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; + struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); + pmctx = br_multicast_pg_to_port_ctx(src->pg); + if (!pmctx) + return; + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; - sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, + + sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) @@ -692,7 +773,28 @@ static void br_multicast_gc(struct hlist_head *head) } } -static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, +static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct sk_buff *skb) +{ + struct net_bridge_vlan *vlan = NULL; + + if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) + vlan = pmctx->vlan; + else if (br_multicast_ctx_is_vlan(brmctx)) + vlan = brmctx->vlan; + + if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { + u16 vlan_proto; + + if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) + return; + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); + } +} + +static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, @@ -714,11 +816,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { - lmqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + lmqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && @@ -734,19 +836,20 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = 
netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); @@ -762,8 +865,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; - iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? - inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; + iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? + inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; @@ -775,12 +878,12 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; - switch (br->multicast_igmp_version) { + switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ih->code = (group ? brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; @@ -790,11 +893,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; - ihv3->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ihv3->code = (group ? 
brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; - ihv3->qqic = br->multicast_query_interval / HZ; + ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; @@ -837,7 +940,8 @@ out: } #if IS_ENABLED(CONFIG_IPV6) -static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, @@ -862,11 +966,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, u8 *hopopt; mld_hdr_size = sizeof(*mldq); - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { - llqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + llqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && @@ -882,20 +986,21 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); @@ -908,14 +1013,14 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; - if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, - &ip6h->saddr)) { + if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, + &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); @@ -933,10 +1038,10 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? 
- br->multicast_query_response_interval : - br->multicast_last_member_interval; + brmctx->multicast_query_response_interval : + brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; - switch (br->multicast_mld_version) { + switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; @@ -959,7 +1064,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); - mld2q->mld2q_qqic = br->multicast_query_interval / HZ; + mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; @@ -1000,7 +1105,8 @@ out: } #endif -static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1013,7 +1119,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); - return br_ip4_multicast_alloc_query(br, pg, + return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, @@ -1028,7 +1134,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); - return br_ip6_multicast_alloc_query(br, pg, + return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, @@ -1206,7 +1312,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( return p; } -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; @@ -1219,7 +1326,7 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (br_group_is_l2(&mp->addr)) return; - mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); + mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) @@ -1235,8 +1342,8 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) } static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1248,29 +1355,28 @@ __br_multicast_add_group(struct net_bridge *br, struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); - if (!port) { - br_multicast_host_join(mp, true); + if (!pmctx) { + br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (br_port_group_equal(p, port, src)) + if 
(br_port_group_equal(p, pmctx->port, src)) goto found; - if ((unsigned long)p->key.port < (unsigned long)port) + if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, + p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); @@ -1279,18 +1385,19 @@ __br_multicast_add_group(struct net_bridge *br, rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; - br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); out: return p; } -static int br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1299,18 +1406,18 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port_group *pg; int err; - spin_lock(&br->multicast_lock); - pg = __br_multicast_add_group(br, port, group, src, filter_mode, + spin_lock(&brmctx->br->multicast_lock); + pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } -static int br_ip4_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, @@ -1328,13 +1435,13 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - igmpv2); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, @@ -1352,8 +1459,8 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = mldv1 ? 
MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - mldv1); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, mldv1); } #endif @@ -1366,52 +1473,54 @@ static bool br_multicast_rport_del(struct hlist_node *rlist) return true; } -static bool br_ip4_multicast_rport_del(struct net_bridge_port *p) +static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { - return br_multicast_rport_del(&p->ip4_rlist); + return br_multicast_rport_del(&pmctx->ip4_rlist); } -static bool br_ip6_multicast_rport_del(struct net_bridge_port *p) +static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - return br_multicast_rport_del(&p->ip6_rlist); + return br_multicast_rport_del(&pmctx->ip6_rlist); #else return false; #endif } -static void br_multicast_router_expired(struct net_bridge_port *port, +static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, struct timer_list *t, struct hlist_node *rlist) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; bool del; spin_lock(&br->multicast_lock); - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM || + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(t)) goto out; del = br_multicast_rport_del(rlist); - br_multicast_rport_del_notify(port, del); + br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_router_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip4_mc_router_timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_mc_router_timer); - br_multicast_router_expired(port, t, &port->ip4_rlist); + br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip6_mc_router_timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_mc_router_timer); - br_multicast_router_expired(port, t, &port->ip6_rlist); + br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } #endif @@ -1428,80 +1537,86 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -static void br_multicast_local_router_expired(struct net_bridge *br, +static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, struct timer_list *timer) { - spin_lock(&br->multicast_lock); - if (br->multicast_router == MDB_RTR_TYPE_DISABLED || - br->multicast_router == MDB_RTR_TYPE_PERM || - br_ip4_multicast_is_router(br) || - br_ip6_multicast_is_router(br)) + spin_lock(&brmctx->br->multicast_lock); + if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + brmctx->multicast_router == MDB_RTR_TYPE_PERM || + br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx)) goto out; - br_mc_router_state_change(br, false); + br_mc_router_state_change(brmctx->br, false); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_mc_router_timer); - br_multicast_local_router_expired(br, t); + 
br_multicast_local_router_expired(brmctx, t); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_mc_router_timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_mc_router_timer); - br_multicast_local_router_expired(br, t); + br_multicast_local_router_expired(brmctx, t); } #endif -static void br_multicast_querier_expired(struct net_bridge *br, +static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!netif_running(brmctx->br->dev) || + br_multicast_ctx_vlan_global_disabled(brmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; - br_multicast_start_querier(br, query); + br_multicast_start_querier(brmctx, query); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_other_query.timer); - br_multicast_querier_expired(br, &br->ip4_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_other_query.timer); - br_multicast_querier_expired(br, &br->ip6_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif -static void br_multicast_select_own_querier(struct net_bridge *br, +static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) - br->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; + brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else - br->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; + brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } -static void __br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1513,19 +1628,22 @@ static void __br_multicast_send_query(struct net_bridge *br, struct sk_buff *skb; u8 igmp_type; + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + return; + again_under_lmqt: - skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs, - over_lmqt, sflag, &igmp_type, + skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, + with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; - if (port) { - skb->dev = port->dev; - br_multicast_count(br, port, skb, igmp_type, + if (pmctx) { + skb->dev = pmctx->port->dev; + br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, - dev_net(port->dev), NULL, skb, NULL, skb->dev, + dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { @@ -1533,35 +1651,35 @@ again_under_lmqt: goto again_under_lmqt; } } else { - br_multicast_select_own_querier(br, group, skb); - 
br_multicast_count(br, port, skb, igmp_type, + br_multicast_select_own_querier(brmctx, group, skb); + br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } -static void br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = NULL; struct br_ip br_group; unsigned long time; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED) || - !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + if (!br_multicast_ctx_should_use(brmctx, pmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_QUERIER)) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); - if (port ? (own_query == &port->ip4_own_query) : - (own_query == &br->ip4_own_query)) { - other_query = &br->ip4_other_query; + if (pmctx ? (own_query == &pmctx->ip4_own_query) : + (own_query == &brmctx->ip4_own_query)) { + other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } @@ -1569,31 +1687,32 @@ static void br_multicast_send_query(struct net_bridge *br, if (!other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0, - NULL); + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, + 0, NULL); time = jiffies; - time += own_query->startup_sent < br->multicast_startup_query_count ? - br->multicast_startup_query_interval : - br->multicast_query_interval; + time += own_query->startup_sent < brmctx->multicast_startup_query_count ? 
+ brmctx->multicast_startup_query_interval : + brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void -br_multicast_port_query_expired(struct net_bridge_port *port, +br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; - if (query->startup_sent < br->multicast_startup_query_count) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; - br_multicast_send_query(port->br, port, query); + br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); @@ -1601,17 +1720,19 @@ out: static void br_ip4_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_own_query.timer); - br_multicast_port_query_expired(port, &port->ip4_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_own_query.timer); - br_multicast_port_query_expired(port, &port->ip6_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif @@ -1620,6 +1741,8 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); @@ -1628,11 +1751,15 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) !br_opt_get(br, BROPT_MULTICAST_QUERIER)) goto out; + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + goto out; + brmctx = br_multicast_port_ctx_get_global(pmctx); if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) @@ -1640,15 +1767,15 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + - br->multicast_last_member_interval); + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } @@ -1666,23 +1793,40 @@ static int br_mc_disabled_update(struct net_device *dev, bool value, return switchdev_port_attr_set(dev, &attr, extack); } -int br_multicast_add_port(struct net_bridge_port *port) +void 
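/* --- annotation, not part of the patch: the rescheduling ternary moved
 * onto brmctx above decides between the startup and the steady-state query
 * interval. A simplified userspace sketch of that rule (it ignores the
 * enable/reset paths that also touch startup_sent); with the defaults set
 * later in this patch, queries start out ~31s apart and settle at 125s:
 */
#include <stdio.h>

int main(void)
{
	const unsigned long HZ = 100;				/* stand-in tick rate */
	unsigned long startup_query_interval = 125 * HZ / 4;	/* 3125 jiffies */
	unsigned long query_interval = 125 * HZ;		/* 12500 jiffies */
	unsigned long startup_query_count = 2;
	unsigned long startup_sent = 0;

	for (int i = 1; i <= 4; i++) {
		/* br_multicast_port_query_expired() bumps the counter first */
		if (startup_sent < startup_query_count)
			startup_sent++;
		/* ...then br_multicast_send_query() picks the interval */
		unsigned long next = startup_sent < startup_query_count ?
				     startup_query_interval : query_interval;
		printf("query %d: next in %lu jiffies\n", i, next);
	}
	return 0;
}
/* --- end annotation --- */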
br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) { - int err; - - port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; - - timer_setup(&port->ip4_mc_router_timer, + pmctx->port = port; + pmctx->vlan = vlan; + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + timer_setup(&pmctx->ip4_mc_router_timer, br_ip4_multicast_router_expired, 0); - timer_setup(&port->ip4_own_query.timer, + timer_setup(&pmctx->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - timer_setup(&port->ip6_mc_router_timer, + timer_setup(&pmctx->ip6_mc_router_timer, br_ip6_multicast_router_expired, 0); - timer_setup(&port->ip6_own_query.timer, + timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif +} + +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&pmctx->ip6_mc_router_timer); +#endif + del_timer_sync(&pmctx->ip4_mc_router_timer); +} + +int br_multicast_add_port(struct net_bridge_port *port) +{ + int err; + + port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; + br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); + err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), @@ -1711,10 +1855,7 @@ void br_multicast_del_port(struct net_bridge_port *port) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); - del_timer_sync(&port->ip4_mc_router_timer); -#if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&port->ip6_mc_router_timer); -#endif + br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } @@ -1727,20 +1868,23 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query) mod_timer(&query->timer, jiffies); } -static void __br_multicast_enable_port(struct net_bridge_port *port) +static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + !netif_running(br->dev)) return; - br_multicast_enable(&port->ip4_own_query); + br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(&pmctx->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM) { - br_ip4_multicast_add_router(br, port); - br_ip6_multicast_add_router(br, port); + if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { + br_ip4_multicast_add_router(brmctx, pmctx); + br_ip6_multicast_add_router(brmctx, pmctx); } } @@ -1748,33 +1892,39 @@ void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; - spin_lock(&br->multicast_lock); - __br_multicast_enable_port(port); - spin_unlock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); + __br_multicast_enable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&br->multicast_lock); } -void br_multicast_disable_port(struct net_bridge_port *port) +static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; bool del = false; - 
spin_lock(&br->multicast_lock); - hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) - br_multicast_find_del_pg(br, pg); + hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && + (!br_multicast_port_ctx_is_vlan(pmctx) || + pg->key.addr.vid == pmctx->vlan->vid)) + br_multicast_find_del_pg(pmctx->port->br, pg); - del |= br_ip4_multicast_rport_del(port); - del_timer(&port->ip4_mc_router_timer); - del_timer(&port->ip4_own_query.timer); - del |= br_ip6_multicast_rport_del(port); + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del_timer(&pmctx->ip4_own_query.timer); + del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&port->ip6_mc_router_timer); - del_timer(&port->ip6_own_query.timer); + del_timer(&pmctx->ip6_mc_router_timer); + del_timer(&pmctx->ip6_own_query.timer); #endif - br_multicast_rport_del_notify(port, del); - spin_unlock(&br->multicast_lock); + br_multicast_rport_del_notify(pmctx, del); +} + +void br_multicast_disable_port(struct net_bridge_port *port) +{ + spin_lock_bh(&port->br->multicast_lock); + __br_multicast_disable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -1799,31 +1949,33 @@ static void __grp_src_mod_timer(struct net_bridge_group_src *src, br_multicast_fwd_src_handle(src); } -static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) +static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; - u32 lmqc = br->multicast_last_member_count; + u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - lmqt = now + br_multicast_lmqt(br); + lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (br_opt_get(brmctx->br, + BROPT_MULTICAST_QUERIER) && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; @@ -1832,41 +1984,42 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) } } - if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) || + if (!br_opt_get(brmctx->br, BROPT_MULTICAST_QUERIER) || !other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); - lmi = now + br->multicast_last_member_interval; + lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } -static void __grp_send_query_and_rexmit(struct 
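/* --- annotation, not part of the patch: the new flush condition in
 * __br_multicast_disable_port_ctx() above only removes a port group when
 * it is non-permanent and, for a per-vlan context, belongs to that vlan.
 * A sketch of just that predicate, with simplified stand-in types:
 */
#include <stdbool.h>
#include <stdio.h>

#define MDB_PG_FLAGS_PERMANENT 0x1

struct pg { unsigned int flags; unsigned short vid; };
struct pmctx { bool is_vlan; unsigned short vid; };

static bool should_flush(const struct pg *pg, const struct pmctx *ctx)
{
	if (pg->flags & MDB_PG_FLAGS_PERMANENT)
		return false;
	/* bridge-wide context flushes all, vlan context only its own vid */
	return !ctx->is_vlan || pg->vid == ctx->vid;
}

int main(void)
{
	struct pmctx vlan10 = { .is_vlan = true, .vid = 10 };
	struct pg a = { .flags = 0, .vid = 10 };
	struct pg b = { .flags = 0, .vid = 20 };

	printf("vid10: %d, vid20: %d\n",	/* prints 1, 0 */
	       should_flush(&a, &vlan10), should_flush(&b, &vlan10));
	return 0;
}
/* --- end annotation --- */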
net_bridge_port_group *pg) +static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; unsigned long now = jiffies, lmi; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (br_opt_get(brmctx->br, BROPT_MULTICAST_QUERIER) && other_query && !timer_pending(&other_query->timer)) { - lmi = now + br->multicast_last_member_interval; - pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + lmi = now + brmctx->multicast_last_member_interval; + pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) @@ -1875,8 +2028,8 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || - time_after(pg->timer.expires, now + br_multicast_lmqt(br)))) - mod_timer(&pg->timer, now + br_multicast_lmqt(br)); + time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) + mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions @@ -1884,11 +2037,11 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ -static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1907,10 +2060,11 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; return changed; @@ -1921,7 +2075,8 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a * Delete (A-B) * Group Timer=GMI */ -static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -1945,7 +2100,8 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); 
__grp_src_delete_marked(pg); } @@ -1956,11 +2112,11 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, * Delete (Y-A) * Group Timer=GMI */ -static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1981,13 +2137,14 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, - now + br_multicast_gmi(br)); + now + br_multicast_gmi(brmctx)); changed = true; } } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) @@ -1996,28 +2153,28 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, return changed; } -static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_isexc_incl(pg, h_addr, srcs, nsrcs, addr_size, + __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_isexc_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -2026,11 +2183,12 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ -static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -2054,14 +2212,15 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2071,11 +2230,12 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,X-A) * Send Q(G) */ -static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool 
__grp_src_toin_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -2102,21 +2262,24 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); - __grp_send_query_and_rexmit(pg); + __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toin(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2124,12 +2287,12 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_toin_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_toin_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2151,7 +2314,9 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A*B) * Group Timer=GMI */ -static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2178,11 +2343,12 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); __grp_src_delete_marked(pg); if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions @@ -2192,7 +2358,9 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A-Y) * Group Timer=GMI */ -static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2224,39 +2392,41 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + 
__grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toex(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_toex_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_toex_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -2264,7 +2434,9 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ -static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2286,11 +2458,12 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2299,7 +2472,9 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ -static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2328,28 +2503,31 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_block(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_block(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_block_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = 
__grp_src_block_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2384,12 +2562,12 @@ br_multicast_find_port(struct net_bridge_mdb_entry *mp, return NULL; } -static int br_ip4_multicast_igmp3_report(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool igmpv2 = br->multicast_igmp_version == 2; + bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; const unsigned char *src; @@ -2436,25 +2614,29 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { - if (!port || igmpv2) { - br_ip4_multicast_leave_group(br, port, group, vid, src); + if (!pmctx || igmpv2) { + br_ip4_multicast_leave_group(brmctx, pmctx, + group, vid, src); continue; } } else { - err = br_ip4_multicast_add_group(br, port, group, vid, - src, igmpv2); + err = br_ip4_multicast_add_group(brmctx, pmctx, group, + vid, src, igmpv2); if (err) break; } - if (!port || igmpv2) + if (!pmctx || igmpv2) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip4_get(br, group, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ @@ -2462,46 +2644,52 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, grec->grec_src, + changed = br_multicast_isexc(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, grec->grec_src, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, grec->grec_src, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, grec->grec_src, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_mld2_report(struct net_bridge *br, - struct net_bridge_port *port, +static int 
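/* --- annotation, not part of the patch: the "State / Msg type / New state"
 * tables quoted in the comments above come from the RFC 3376/3810 source
 * filtering state machines. The simplest transition,
 * INCLUDE (A) + ALLOW (B) -> INCLUDE (A+B), is a set union with a timer
 * refresh per source; br_multicast_isinc_allow() does the same over
 * pg->src_list. Modeled here on plain integer "addresses":
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_SRCS 8

struct grp {
	unsigned int srcs[MAX_SRCS];
	int nsrcs;
};

static bool add_src(struct grp *g, unsigned int s)	/* true if new */
{
	for (int i = 0; i < g->nsrcs; i++)
		if (g->srcs[i] == s)
			return false;	/* already present: timer refresh only */
	if (g->nsrcs < MAX_SRCS)
		g->srcs[g->nsrcs++] = s;
	return true;
}

int main(void)
{
	struct grp g = { .srcs = {1, 2}, .nsrcs = 2 };	/* INCLUDE (A) */
	unsigned int allow[] = {2, 3};			/* ALLOW (B) */
	bool changed = false;

	for (int i = 0; i < 2; i++)
		changed |= add_src(&g, allow[i]);
	printf("changed=%d nsrcs=%d\n", changed, g.nsrcs);	/* 1, 3 */
	return 0;
}
/* --- end annotation --- */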
br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool mldv1 = br->multicast_mld_version == 1; + bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; @@ -2562,137 +2750,144 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { - if (!port || mldv1) { - br_ip6_multicast_leave_group(br, port, + if (!pmctx || mldv1) { + br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { - err = br_ip6_multicast_add_group(br, port, + err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } - if (!port || mldv1) + if (!pmctx || mldv1) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, + changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #endif -static bool br_ip4_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, +static bool br_ip4_multicast_select_querier(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 saddr) { - if (!timer_pending(&br->ip4_own_query.timer) && - !timer_pending(&br->ip4_other_query.timer)) + struct net_bridge_port *port = pmctx ? 
pmctx->port : NULL; + + if (!timer_pending(&brmctx->ip4_own_query.timer) && + !timer_pending(&brmctx->ip4_other_query.timer)) goto update; - if (!br->ip4_querier.addr.src.ip4) + if (!brmctx->ip4_querier.addr.src.ip4) goto update; - if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.src.ip4)) + if (ntohl(saddr) <= ntohl(brmctx->ip4_querier.addr.src.ip4)) goto update; return false; update: - br->ip4_querier.addr.src.ip4 = saddr; + brmctx->ip4_querier.addr.src.ip4 = saddr; /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip4_querier.port, port); + rcu_assign_pointer(brmctx->ip4_querier.port, port); return true; } #if IS_ENABLED(CONFIG_IPV6) -static bool br_ip6_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, +static bool br_ip6_multicast_select_querier(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct in6_addr *saddr) { - if (!timer_pending(&br->ip6_own_query.timer) && - !timer_pending(&br->ip6_other_query.timer)) + struct net_bridge_port *port = pmctx ? pmctx->port : NULL; + + if (!timer_pending(&brmctx->ip6_own_query.timer) && + !timer_pending(&brmctx->ip6_other_query.timer)) goto update; - if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.src.ip6) <= 0) + if (ipv6_addr_cmp(saddr, &brmctx->ip6_querier.addr.src.ip6) <= 0) goto update; return false; update: - br->ip6_querier.addr.src.ip6 = *saddr; + brmctx->ip6_querier.addr.src.ip6 = *saddr; /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip6_querier.port, port); + rcu_assign_pointer(brmctx->ip6_querier.port, port); return true; } #endif static void -br_multicast_update_query_timer(struct net_bridge *br, +br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) query->delay_time = jiffies + max_delay; - mod_timer(&query->timer, jiffies + br->multicast_querier_interval); + mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, @@ -2709,19 +2904,26 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, } static struct net_bridge_port * -br_multicast_rport_from_node(struct net_bridge *br, +br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, struct hlist_head *mc_router_list, struct hlist_node *rlist) { + struct net_bridge_mcast_port *pmctx; + #if IS_ENABLED(CONFIG_IPV6) - if (mc_router_list == &br->ip6_mc_router_list) - return hlist_entry(rlist, struct net_bridge_port, ip6_rlist); + if (mc_router_list == &brmctx->ip6_mc_router_list) + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip6_rlist); + else #endif - return hlist_entry(rlist, struct net_bridge_port, ip4_rlist); + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip4_rlist); + + return pmctx->port; } static struct hlist_node * -br_multicast_get_rport_slot(struct net_bridge *br, +br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, struct net_bridge_port *port, struct hlist_head *mc_router_list) @@ -2731,7 +2933,7 @@ br_multicast_get_rport_slot(struct net_bridge *br, struct hlist_node *rlist; hlist_for_each(rlist, mc_router_list) { - p = br_multicast_rport_from_node(br, mc_router_list, rlist); + p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); if ((unsigned long)port >= (unsigned long)p) break; @@ -2742,14 +2944,14 @@ br_multicast_get_rport_slot(struct net_bridge *br, return slot; } -static bool 
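/* --- annotation, not part of the patch: br_ip4_multicast_select_querier()
 * above implements the usual IGMP querier election -- the numerically
 * lowest source address wins. A sketch of just the comparison (the real
 * function also updates unconditionally while no query timers are
 * pending, and records the port under the multicast lock):
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool new_querier_wins(uint32_t cur_be, uint32_t cand_be)
{
	if (!cur_be)			/* no querier learned yet */
		return true;
	return ntohl(cand_be) <= ntohl(cur_be);
}

int main(void)
{
	uint32_t cur, cand;

	inet_pton(AF_INET, "10.0.0.5", &cur);
	inet_pton(AF_INET, "10.0.0.2", &cand);
	printf("10.0.0.2 displaces 10.0.0.5: %d\n",	/* prints 1 */
	       new_querier_wins(cur, cand));
	return 0;
}
/* --- end annotation --- */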
br_multicast_no_router_otherpf(struct net_bridge_port *port, +static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, struct hlist_node *rnode) { #if IS_ENABLED(CONFIG_IPV6) - if (rnode != &port->ip6_rlist) - return hlist_unhashed(&port->ip6_rlist); + if (rnode != &pmctx->ip6_rlist) + return hlist_unhashed(&pmctx->ip6_rlist); else - return hlist_unhashed(&port->ip4_rlist); + return hlist_unhashed(&pmctx->ip4_rlist); #else return true; #endif @@ -2759,8 +2961,8 @@ static bool br_multicast_no_router_otherpf(struct net_bridge_port *port, * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct hlist_node *rlist, struct hlist_head *mc_router_list) { @@ -2769,7 +2971,7 @@ static void br_multicast_add_router(struct net_bridge *br, if (!hlist_unhashed(rlist)) return; - slot = br_multicast_get_rport_slot(br, port, mc_router_list); + slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) hlist_add_behind_rcu(rlist, slot); @@ -2780,9 +2982,9 @@ static void br_multicast_add_router(struct net_bridge *br, * switched from no IPv4/IPv6 multicast router to a new * IPv4 or IPv6 multicast router. */ - if (br_multicast_no_router_otherpf(port, rlist)) { - br_rtr_notify(br->dev, port, RTM_NEWMDB); - br_port_mc_router_state_change(port, true); + if (br_multicast_no_router_otherpf(pmctx, rlist)) { + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); + br_port_mc_router_state_change(pmctx->port, true); } } @@ -2790,116 +2992,119 @@ static void br_multicast_add_router(struct net_bridge *br, * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_ip4_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { - br_multicast_add_router(br, port, &port->ip4_rlist, - &br->ip4_mc_router_list); + br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, + &brmctx->ip4_mc_router_list); } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_ip6_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - br_multicast_add_router(br, port, &port->ip6_rlist, - &br->ip6_mc_router_list); + br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, + &brmctx->ip6_mc_router_list); #endif } -static void br_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct timer_list *timer, struct hlist_node *rlist, struct hlist_head *mc_router_list) { unsigned long now = jiffies; - if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!br_ip4_multicast_is_router(br) && - !br_ip6_multicast_is_router(br)) - br_mc_router_state_change(br, true); - mod_timer(timer, now + br->multicast_querier_interval); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + return; + + if (!pmctx) { + if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if 
(!br_ip4_multicast_is_router(brmctx) && + !br_ip6_multicast_is_router(brmctx)) + br_mc_router_state_change(brmctx->br, true); + mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM) + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; - br_multicast_add_router(br, port, rlist, mc_router_list); - mod_timer(timer, now + br->multicast_querier_interval); + br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); + mod_timer(timer, now + brmctx->multicast_querier_interval); } -static void br_ip4_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { - struct timer_list *timer = &br->ip4_mc_router_timer; + struct timer_list *timer = &brmctx->ip4_mc_router_timer; struct hlist_node *rlist = NULL; - if (port) { - timer = &port->ip4_mc_router_timer; - rlist = &port->ip4_rlist; + if (pmctx) { + timer = &pmctx->ip4_mc_router_timer; + rlist = &pmctx->ip4_rlist; } - br_multicast_mark_router(br, port, timer, rlist, - &br->ip4_mc_router_list); + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip4_mc_router_list); } -static void br_ip6_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - struct timer_list *timer = &br->ip6_mc_router_timer; + struct timer_list *timer = &brmctx->ip6_mc_router_timer; struct hlist_node *rlist = NULL; - if (port) { - timer = &port->ip6_mc_router_timer; - rlist = &port->ip6_rlist; + if (pmctx) { + timer = &pmctx->ip6_mc_router_timer; + rlist = &pmctx->ip6_rlist; } - br_multicast_mark_router(br, port, timer, rlist, - &br->ip6_mc_router_list); + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip6_mc_router_list); #endif } static void -br_ip4_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, +br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { - if (!br_ip4_multicast_select_querier(br, port, saddr->src.ip4)) + if (!br_ip4_multicast_select_querier(brmctx, pmctx, saddr->src.ip4)) return; - br_multicast_update_query_timer(br, query, max_delay); - br_ip4_multicast_mark_router(br, port); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip4_multicast_mark_router(brmctx, pmctx); } #if IS_ENABLED(CONFIG_IPV6) static void -br_ip6_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, +br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { - if (!br_ip6_multicast_select_querier(br, port, &saddr->src.ip6)) + if (!br_ip6_multicast_select_querier(brmctx, pmctx, &saddr->src.ip6)) return; - br_multicast_update_query_timer(br, query, max_delay); - br_ip6_multicast_mark_router(br, port); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip6_multicast_mark_router(brmctx, pmctx); } #endif -static void br_ip4_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_query(struct 
net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2915,9 +3120,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, unsigned long now = jiffies; __be32 group; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; @@ -2932,7 +3136,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || - (br->multicast_igmp_version == 3 && group && ih3->suppress)) + (brmctx->multicast_igmp_version == 3 && group && + ih3->suppress)) goto out; max_delay = ih3->code ? @@ -2945,16 +3150,17 @@ static void br_ip4_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; - br_ip4_multicast_query_received(br, port, &br->ip4_other_query, + br_ip4_multicast_query_received(brmctx, pmctx, + &brmctx->ip4_other_query, &saddr, max_delay); goto out; } - mp = br_mdb_ip4_get(br, group, vid); + mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? @@ -2963,23 +3169,23 @@ static void br_ip4_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_igmp_version == 2 || + (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2997,9 +3203,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, bool is_general_query; int err = 0; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { @@ -3019,7 +3224,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - if (br->multicast_mld_version == 2 && + if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; @@ -3033,18 +3238,19 @@ static int br_ip6_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; - br_ip6_multicast_query_received(br, port, &br->ip6_other_query, + br_ip6_multicast_query_received(brmctx, pmctx, + &brmctx->ip6_other_query, &saddr, max_delay); goto out; } else if (!group) { goto out; } - mp = br_mdb_ip6_get(br, group, vid); + mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? 
time_after(mp->timer.expires, now + max_delay) : @@ -3052,25 +3258,25 @@ static int br_ip6_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_mld_version == 1 || + (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void -br_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +br_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, @@ -3081,22 +3287,21 @@ br_multicast_leave_group(struct net_bridge *br, unsigned long now; unsigned long time; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_mdb_ip_get(br, group); + mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; - if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { + if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (!br_port_group_equal(p, port, src)) + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) @@ -3111,19 +3316,19 @@ br_multicast_leave_group(struct net_bridge *br, if (timer_pending(&other_query->timer)) goto out; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - __br_multicast_send_query(br, port, NULL, NULL, &mp->addr, + if (br_opt_get(brmctx->br, BROPT_MULTICAST_QUERIER)) { + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); - time = jiffies + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = jiffies + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); - for (p = mlock_dereference(mp->ports, br); - p != NULL; - p = mlock_dereference(p->next, br)) { - if (!br_port_group_equal(p, port, src)) + for (p = mlock_dereference(mp->ports, brmctx->br); + p != NULL && pmctx != NULL; + p = mlock_dereference(p->next, brmctx->br)) { + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && @@ -3138,10 +3343,10 @@ br_multicast_leave_group(struct net_bridge *br, } now = jiffies; - time = now + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = now + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; - if (!port) { + if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? 
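/* --- annotation, not part of the patch: on a leave,
 * br_multicast_leave_group() above does not delete the entry outright; it
 * pulls the expiry in to last_member_count * last_member_interval (2 * 1s
 * with the defaults) so membership lapses quickly unless a report
 * refreshes it. A simplified sketch of that timer shrink (the pending
 * check stands in for the timer_pending()/time_after() pair):
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long maybe_shrink(unsigned long expires, unsigned long now,
				  unsigned long count, unsigned long interval,
				  bool pending)
{
	unsigned long time = now + count * interval;

	/* only ever move the expiry earlier, never later */
	if (!pending || expires > time)
		return time;
	return expires;
}

int main(void)
{
	/* a timer due at 1000 is pulled in to now + 2*1 = 102 */
	printf("%lu\n", maybe_shrink(1000, 100, 2, 1, true));
	return 0;
}
/* --- end annotation --- */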
time_after(mp->timer.expires, time) : @@ -3152,10 +3357,10 @@ br_multicast_leave_group(struct net_bridge *br, goto out; } - for (p = mlock_dereference(mp->ports, br); + for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; - p = mlock_dereference(p->next, br)) { - if (p->key.port != port) + p = mlock_dereference(p->next, brmctx->br)) { + if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && @@ -3168,11 +3373,11 @@ br_multicast_leave_group(struct net_bridge *br, break; } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) @@ -3183,20 +3388,21 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, if (ipv4_is_local_multicast(group)) return; - own_query = port ? &port->ip4_own_query : &br->ip4_own_query; + own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) @@ -3207,14 +3413,15 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return; - own_query = port ? &port->ip6_own_query : &br->ip6_own_query; + own_query = pmctx ? 
&pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip6_other_query, own_query, src); } #endif @@ -3252,8 +3459,8 @@ static void br_multicast_err_count(const struct net_bridge *br, u64_stats_update_end(&pstats->syncp); } -static void br_multicast_pim(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_pim(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); @@ -3264,31 +3471,32 @@ static void br_multicast_pim(struct net_bridge *br, pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; - spin_lock(&br->multicast_lock); - br_ip4_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; - spin_lock(&br->multicast_lock); - br_ip4_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); return 0; } -static int br_multicast_ipv4_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? 
pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; @@ -3300,14 +3508,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) - br_multicast_pim(br, port, skb); + br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { - br_ip4_multicast_mrd_rcv(br, port, skb); + br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3319,44 +3527,45 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid, src, - true); + err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, + src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb, vid); + err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - br_ip4_multicast_query(br, port, skb, vid); + br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group, vid, src); + br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - spin_lock(&br->multicast_lock); - br_ip6_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip6_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_multicast_ipv6_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? 
pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; @@ -3368,11 +3577,11 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) - br_ip6_multicast_mrd_rcv(br, port, skb); + br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3383,29 +3592,32 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, - src, true); + err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, + vid, src, true); break; case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb, vid); + err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb, vid); + err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; - br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src); + br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, + src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; @@ -3413,16 +3625,36 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; + if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { + const struct net_bridge_vlan *masterv; + + /* the vlan has the master flag set only when transmitting + * through the bridge device + */ + if (br_vlan_is_master(vlan)) { + masterv = vlan; + *brmctx = &vlan->br_mcast_ctx; + *pmctx = NULL; + } else { + masterv = vlan->brvlan; + *brmctx = &vlan->brvlan->br_mcast_ctx; + *pmctx = &vlan->port_mcast_ctx; + } + + if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return 0; + } + switch (skb->protocol) { case htons(ETH_P_IP): - ret = br_multicast_ipv4_rcv(br, port, skb, vid); + ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - ret = br_multicast_ipv6_rcv(br, port, skb, vid); + ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } @@ -3430,32 +3662,40 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, return ret; } -static void br_multicast_query_expired(struct net_bridge *br, +static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query, struct bridge_mcast_querier *querier) { - spin_lock(&br->multicast_lock); - if (query->startup_sent < br->multicast_startup_query_count) + spin_lock(&brmctx->br->multicast_lock); + if (br_multicast_ctx_vlan_disabled(brmctx)) + goto out; + + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; 
RCU_INIT_POINTER(querier->port, NULL); - br_multicast_send_query(br, NULL, query); - spin_unlock(&br->multicast_lock); + br_multicast_send_query(brmctx, NULL, query); +out: + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_own_query.timer); - br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, + &brmctx->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_own_query.timer); - br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, + &brmctx->ip6_querier); } #endif @@ -3472,47 +3712,63 @@ static void br_multicast_gc_work(struct work_struct *work) br_multicast_gc(&deleted_head); } -void br_multicast_init(struct net_bridge *br) +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) { - br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; - - br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - br->multicast_last_member_count = 2; - br->multicast_startup_query_count = 2; + brmctx->br = br; + brmctx->vlan = vlan; + brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + brmctx->multicast_last_member_count = 2; + brmctx->multicast_startup_query_count = 2; - br->multicast_last_member_interval = HZ; - br->multicast_query_response_interval = 10 * HZ; - br->multicast_startup_query_interval = 125 * HZ / 4; - br->multicast_query_interval = 125 * HZ; - br->multicast_querier_interval = 255 * HZ; - br->multicast_membership_interval = 260 * HZ; + brmctx->multicast_last_member_interval = HZ; + brmctx->multicast_query_response_interval = 10 * HZ; + brmctx->multicast_startup_query_interval = 125 * HZ / 4; + brmctx->multicast_query_interval = 125 * HZ; + brmctx->multicast_querier_interval = 255 * HZ; + brmctx->multicast_membership_interval = 260 * HZ; - br->ip4_other_query.delay_time = 0; - br->ip4_querier.port = NULL; - br->multicast_igmp_version = 2; + brmctx->ip4_other_query.delay_time = 0; + brmctx->ip4_querier.port = NULL; + brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) - br->multicast_mld_version = 1; - br->ip6_other_query.delay_time = 0; - br->ip6_querier.port = NULL; + brmctx->multicast_mld_version = 1; + brmctx->ip6_other_query.delay_time = 0; + brmctx->ip6_querier.port = NULL; #endif - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); - spin_lock_init(&br->multicast_lock); - timer_setup(&br->ip4_mc_router_timer, + timer_setup(&brmctx->ip4_mc_router_timer, br_ip4_multicast_local_router_expired, 0); - timer_setup(&br->ip4_other_query.timer, + timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); - timer_setup(&br->ip4_own_query.timer, + timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - timer_setup(&br->ip6_mc_router_timer, + timer_setup(&brmctx->ip6_mc_router_timer, br_ip6_multicast_local_router_expired, 0); - timer_setup(&br->ip6_other_query.timer, + timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); - timer_setup(&br->ip6_own_query.timer, + 
timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif +} + +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ + __br_multicast_stop(brmctx); +} + +void br_multicast_init(struct net_bridge *br) +{ + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; + + br_multicast_ctx_init(br, NULL, &br->multicast_ctx); + + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + + spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); @@ -3580,8 +3836,8 @@ void br_multicast_leave_snoopers(struct net_bridge *br) br_ip6_multicast_leave_snoopers(br); } -static void __br_multicast_open(struct net_bridge *br, - struct bridge_mcast_own_query *query) +static void __br_multicast_open_query(struct net_bridge *br, + struct bridge_mcast_own_query *query) { query->startup_sent = 0; @@ -3591,26 +3847,191 @@ static void __br_multicast_open(struct net_bridge *br, mod_timer(&query->timer, jiffies); } -void br_multicast_open(struct net_bridge *br) +static void __br_multicast_open(struct net_bridge_mcast *brmctx) { - __br_multicast_open(br, &br->ip4_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - __br_multicast_open(br, &br->ip6_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } -void br_multicast_stop(struct net_bridge *br) +void br_multicast_open(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_open(&vlan->br_mcast_ctx); + } + } + } + + __br_multicast_open(&br->multicast_ctx); +} + +static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { - del_timer_sync(&br->ip4_mc_router_timer); - del_timer_sync(&br->ip4_other_query.timer); - del_timer_sync(&br->ip4_own_query.timer); + del_timer_sync(&brmctx->ip4_mc_router_timer); + del_timer_sync(&brmctx->ip4_other_query.timer); + del_timer_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&br->ip6_mc_router_timer); - del_timer_sync(&br->ip6_other_query.timer); - del_timer_sync(&br->ip6_own_query.timer); + del_timer_sync(&brmctx->ip6_mc_router_timer); + del_timer_sync(&brmctx->ip6_other_query.timer); + del_timer_sync(&brmctx->ip6_own_query.timer); #endif } +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge *br; + + /* it's okay to check for the flag without the multicast lock because it + * can only change under RTNL -> multicast_lock, we need the latter to + * sync with timers and packets + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) + return; + + if (br_vlan_is_master(vlan)) { + br = vlan->br; + + if (!br_vlan_is_brentry(vlan) || + (on && + br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) + return; + + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + spin_unlock_bh(&br->multicast_lock); + + if (on) + __br_multicast_open(&vlan->br_mcast_ctx); + else + __br_multicast_stop(&vlan->br_mcast_ctx); + } else { + struct net_bridge_mcast *brmctx; + + brmctx = 
br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); + if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) + return; + + br = vlan->port->br; + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + if (on) + __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); + else + __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); + spin_unlock_bh(&br->multicast_lock); + } +} + +void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge_port *p; + + if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) + return; + + list_for_each_entry(p, &vlan->br->port_list, list) { + struct net_bridge_vlan *vport; + + vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); + if (!vport) + continue; + br_multicast_toggle_one_vlan(vport, on); + } +} + +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + struct net_bridge_port *p; + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) + return 0; + + if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); + return -EINVAL; + } + + vg = br_vlan_group(br); + if (!vg) + return 0; + + br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); + + /* disable/enable non-vlan mcast contexts based on vlan snooping */ + if (on) + __br_multicast_stop(&br->multicast_ctx); + else + __br_multicast_open(&br->multicast_ctx); + list_for_each_entry(p, &br->port_list, list) { + if (on) + br_multicast_disable_port(p); + else + br_multicast_enable_port(p); + } + + list_for_each_entry(vlan, &vg->vlan_list, vlist) + br_multicast_toggle_vlan(vlan, on); + + return 0; +} + +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) +{ + ASSERT_RTNL(); + + /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and + * requires only RTNL to change + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return false; + + vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; + br_multicast_toggle_vlan(vlan, on); + + return true; +} + +void br_multicast_stop(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_stop(&vlan->br_mcast_ctx); + } + } + } + + __br_multicast_stop(&br->multicast_ctx); +} + void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; @@ -3623,6 +4044,7 @@ void br_multicast_dev_del(struct net_bridge *br) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); + br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); @@ -3631,6 +4053,7 @@ void br_multicast_dev_del(struct net_bridge *br) int br_multicast_set_router(struct net_bridge *br, unsigned long val) { + struct net_bridge_mcast *brmctx = &br->multicast_ctx; int err = -EINVAL; spin_lock_bh(&br->multicast_lock); @@ -3639,17 +4062,17 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); - 
del_timer(&br->ip4_mc_router_timer); + del_timer(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&br->ip6_mc_router_timer); + del_timer(&brmctx->ip6_mc_router_timer); #endif - br->multicast_router = val; + brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: - if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) br_mc_router_state_change(br, false); - br->multicast_router = val; + brmctx->multicast_router = val; err = 0; break; } @@ -3660,7 +4083,7 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) } static void -br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { if (!deleted) return; @@ -3668,37 +4091,38 @@ br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) /* For backwards compatibility for now, only notify if there is * no multicast router anymore for both IPv4 and IPv6. */ - if (!hlist_unhashed(&p->ip4_rlist)) + if (!hlist_unhashed(&pmctx->ip4_rlist)) return; #if IS_ENABLED(CONFIG_IPV6) - if (!hlist_unhashed(&p->ip6_rlist)) + if (!hlist_unhashed(&pmctx->ip6_rlist)) return; #endif - br_rtr_notify(p->br->dev, p, RTM_DELMDB); - br_port_mc_router_state_change(p, false); + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); + br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { - struct net_bridge *br = p->br; + struct net_bridge_mcast *brmctx = &p->br->multicast_ctx; + struct net_bridge_mcast_port *pmctx = &p->multicast_ctx; unsigned long now = jiffies; int err = -EINVAL; bool del = false; - spin_lock(&br->multicast_lock); - if (p->multicast_router == val) { + spin_lock(&p->br->multicast_lock); + if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) { - mod_timer(&p->ip4_mc_router_timer, - now + br->multicast_querier_interval); + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { + mod_timer(&pmctx->ip4_mc_router_timer, + now + brmctx->multicast_querier_interval); #if IS_ENABLED(CONFIG_IPV6) - mod_timer(&p->ip6_mc_router_timer, - now + br->multicast_querier_interval); + mod_timer(&pmctx->ip6_mc_router_timer, + now + brmctx->multicast_querier_interval); #endif } err = 0; @@ -3706,63 +4130,86 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) } switch (val) { case MDB_RTR_TYPE_DISABLED: - p->multicast_router = MDB_RTR_TYPE_DISABLED; - del |= br_ip4_multicast_rport_del(p); - del_timer(&p->ip4_mc_router_timer); - del |= br_ip6_multicast_rport_del(p); + pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&p->ip6_mc_router_timer); + del_timer(&pmctx->ip6_mc_router_timer); #endif - br_multicast_rport_del_notify(p, del); + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - del |= br_ip4_multicast_rport_del(p); - del |= br_ip6_multicast_rport_del(p); - br_multicast_rport_del_notify(p, del); 
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + del |= br_ip4_multicast_rport_del(pmctx); + del |= br_ip6_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: - p->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&p->ip4_mc_router_timer); - br_ip4_multicast_add_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_PERM; + del_timer(&pmctx->ip4_mc_router_timer); + br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&p->ip6_mc_router_timer); + del_timer(&pmctx->ip6_mc_router_timer); #endif - br_ip6_multicast_add_router(br, p); + br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: - p->multicast_router = MDB_RTR_TYPE_TEMP; - br_ip4_multicast_mark_router(br, p); - br_ip6_multicast_mark_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_TEMP; + br_ip4_multicast_mark_router(brmctx, pmctx); + br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: - spin_unlock(&br->multicast_lock); + spin_unlock(&p->br->multicast_lock); return err; } -static void br_multicast_start_querier(struct net_bridge *br, +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; - __br_multicast_open(br, query); + __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); - list_for_each_entry_rcu(port, &br->port_list, list) { - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { + struct bridge_mcast_own_query *ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query *ip6_own_query; +#endif + + if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; - if (query == &br->ip4_own_query) - br_multicast_enable(&port->ip4_own_query); + if (br_multicast_ctx_is_vlan(brmctx)) { + struct net_bridge_vlan *vlan; + + vlan = br_vlan_find(nbp_vlan_group(port), brmctx->vlan->vid); + if (!vlan || + br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) + continue; + + ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; +#endif + } else { + ip4_own_query = &port->multicast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &port->multicast_ctx.ip6_own_query; +#endif + } + + if (query == &brmctx->ip4_own_query) + br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); @@ -3796,7 +4243,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) - __br_multicast_enable_port(port); + __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; @@ -3839,7 +4286,7 @@ bool br_multicast_router(const struct net_device *dev) bool is_router; spin_lock_bh(&br->multicast_lock); - is_router = br_multicast_is_router(br, NULL); + is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } @@ -3847,6 +4294,7 @@ EXPORT_SYMBOL_GPL(br_multicast_router); int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { + struct net_bridge_mcast *brmctx = &br->multicast_ctx; unsigned long max_delay; val = !!val; @@ -3859,18 +4307,18 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) if 
(!val) goto unlock; - max_delay = br->multicast_query_response_interval; + max_delay = brmctx->multicast_query_response_interval; - if (!timer_pending(&br->ip4_other_query.timer)) - br->ip4_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip4_other_query.timer)) + brmctx->ip4_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip4_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - if (!timer_pending(&br->ip6_other_query.timer)) - br->ip6_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip6_other_query.timer)) + brmctx->ip6_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip6_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: @@ -3891,7 +4339,7 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) } spin_lock_bh(&br->multicast_lock); - br->multicast_igmp_version = val; + br->multicast_ctx.multicast_igmp_version = val; spin_unlock_bh(&br->multicast_lock); return 0; @@ -3910,7 +4358,7 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) } spin_lock_bh(&br->multicast_lock); - br->multicast_mld_version = val; + br->multicast_ctx.multicast_mld_version = val; spin_unlock_bh(&br->multicast_lock); return 0; @@ -4003,7 +4451,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(ð, 0, sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, ð, NULL); + ret = br_multicast_querier_exists(&br->multicast_ctx, ð, NULL); unlock: rcu_read_unlock(); @@ -4022,6 +4470,7 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { + struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; @@ -4035,17 +4484,18 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) goto unlock; br = port->br; + brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: - if (!timer_pending(&br->ip4_other_query.timer) || - rcu_dereference(br->ip4_querier.port) == port) + if (!timer_pending(&brmctx->ip4_other_query.timer) || + rcu_dereference(brmctx->ip4_querier.port) == port) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: - if (!timer_pending(&br->ip6_other_query.timer) || - rcu_dereference(br->ip6_querier.port) == port) + if (!timer_pending(&brmctx->ip6_other_query.timer) || + rcu_dereference(brmctx->ip6_querier.port) == port) goto unlock; break; #endif @@ -4071,7 +4521,9 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); */ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) { - struct net_bridge_port *port, *p; + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; + struct net_bridge_port *port; bool ret = false; rcu_read_lock(); @@ -4079,11 +4531,12 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) if (!port) goto unlock; + brmctx = &port->br->multicast_ctx; switch (proto) { case ETH_P_IP: - hlist_for_each_entry_rcu(p, &port->br->ip4_mc_router_list, + hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) { - if (p == port) + if (pmctx->port == port) continue; ret = true; @@ -4092,9 +4545,9 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: - hlist_for_each_entry_rcu(p, 
&port->br->ip6_mc_router_list, + hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) { - if (p == port) + if (pmctx->port == port) continue; ret = true; @@ -4186,7 +4639,8 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, u64_stats_update_end(&pstats->syncp); } -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index 13290a749d09..f91c071d1608 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -33,7 +33,8 @@ static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr); -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -388,7 +389,8 @@ static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, } } -static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, +static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, int filter_mode) { @@ -405,14 +407,15 @@ static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); break; case MCAST_EXCLUDE: - br_multicast_create_eht_set_entry(pg, &zero_addr, h_addr, - MCAST_EXCLUDE, + br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr, + h_addr, MCAST_EXCLUDE, true); break; } } -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -441,8 +444,8 @@ static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, if (!set_h) goto fail_set_entry; - mod_timer(&set_h->timer, jiffies + br_multicast_gmi(br)); - mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx)); + mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx)); return; @@ -499,7 +502,8 @@ static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, } /* create new set entries from reports */ -static void __eht_create_set_entries(struct net_bridge_port_group *pg, +static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -512,8 +516,8 @@ static void __eht_create_set_entries(struct net_bridge_port_group *pg, memset(&eht_src_addr, 0, sizeof(eht_src_addr)); for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - filter_mode, + br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr, + h_addr, filter_mode, false); } } @@ -549,7 +553,8 @@ static bool __eht_del_set_entries(struct net_bridge_port_group *pg, return changed; } -static bool 
br_multicast_eht_allow(struct net_bridge_port_group *pg, +static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -559,8 +564,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, - MCAST_INCLUDE); + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, MCAST_INCLUDE); break; case MCAST_EXCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, @@ -571,7 +576,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_eht_block(struct net_bridge_port_group *pg, +static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -585,7 +591,7 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, addr_size); break; case MCAST_EXCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE); break; } @@ -594,7 +600,8 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, } /* flush_entries is true when changing mode */ -static bool __eht_inc_exc(struct net_bridge_port_group *pg, +static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -612,11 +619,10 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_eht_set *eht_set; struct net_bridge_group_src *src_ent; struct hlist_node *tmp; @@ -647,14 +653,15 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, &eht_src_addr); if (!eht_set) continue; - mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(br)); + mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx)); } } return changed; } -static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -663,14 +670,15 @@ static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_INCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE); return changed; } -static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -679,14 +687,15 @@ static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = 
__eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_EXCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE); return changed; } -static bool __eht_ip4_handle(struct net_bridge_port_group *pg, +static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -696,24 +705,25 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, switch (grec_type) { case IGMPV3_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, sizeof(__be32)); + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, + sizeof(__be32)); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; case IGMPV3_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; } @@ -722,7 +732,8 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, } #if IS_ENABLED(CONFIG_IPV6) -static bool __eht_ip6_handle(struct net_bridge_port_group *pg, +static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -732,18 +743,18 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, switch (grec_type) { case MLD2_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -751,7 +762,7 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, to_report = true; fallthrough; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -762,7 +773,8 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, #endif /* true means an entry was deleted */ -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, @@ -779,12 +791,12 @@ bool br_multicast_eht_handle(struct net_bridge_port_group *pg, memset(&eht_host_addr, 0, sizeof(eht_host_addr)); memcpy(&eht_host_addr, h_addr, addr_size); if (addr_size == sizeof(__be32)) - changed = __eht_ip4_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #if IS_ENABLED(CONFIG_IPV6) else - changed = 
__eht_ip6_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #endif out: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8642e56059fb..616a1b6dec3c 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -287,7 +287,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, - p->multicast_router) || + p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, @@ -1324,49 +1324,49 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); - br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { @@ -1566,7 +1566,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, + br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, @@ -1578,38 +1579,38 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, - br->multicast_last_member_count) || + br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, 
IFLA_BR_MCAST_STARTUP_QUERY_CNT, - br->multicast_startup_query_count) || + br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, - br->multicast_igmp_version)) + br->multicast_ctx.multicast_igmp_version)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, - br->multicast_mld_version)) + br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif - clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_membership_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_querier_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2b48b204205e..c939631428b9 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,6 +29,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 +#define BR_HWDOM_MAX BITS_PER_LONG + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -89,6 +91,59 @@ struct bridge_mcast_stats { }; #endif +/* net_bridge_mcast_port must be always defined due to forwarding stubs */ +struct net_bridge_mcast_port { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge_port *port; + struct net_bridge_vlan *vlan; + + struct bridge_mcast_own_query ip4_own_query; + struct timer_list ip4_mc_router_timer; + struct hlist_node ip4_rlist; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query ip6_own_query; + struct timer_list ip6_mc_router_timer; + struct hlist_node ip6_rlist; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + unsigned char multicast_router; +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + +/* net_bridge_mcast must be always defined due to forwarding stubs */ +struct net_bridge_mcast { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge *br; + struct net_bridge_vlan *vlan; + + u32 multicast_last_member_count; + u32 multicast_startup_query_count; + + u8 multicast_igmp_version; + u8 multicast_router; +#if IS_ENABLED(CONFIG_IPV6) + u8 multicast_mld_version; +#endif + unsigned long multicast_last_member_interval; + unsigned long multicast_membership_interval; + unsigned long multicast_querier_interval; + unsigned long multicast_query_interval; + unsigned long multicast_query_response_interval; + 
unsigned long multicast_startup_query_interval; + struct hlist_head ip4_mc_router_list; + struct timer_list ip4_mc_router_timer; + struct bridge_mcast_other_query ip4_other_query; + struct bridge_mcast_own_query ip4_own_query; + struct bridge_mcast_querier ip4_querier; +#if IS_ENABLED(CONFIG_IPV6) + struct hlist_head ip6_mc_router_list; + struct timer_list ip6_mc_router_timer; + struct bridge_mcast_other_query ip6_other_query; + struct bridge_mcast_own_query ip6_own_query; + struct bridge_mcast_querier ip6_querier; +#endif /* IS_ENABLED(CONFIG_IPV6) */ +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst __rcu *tunnel_dst; @@ -98,6 +153,8 @@ struct br_tunnel_info { enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), + BR_VLFLAG_MCAST_ENABLED = BIT(2), + BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3), }; /** @@ -114,6 +171,9 @@ enum { * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry + * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context + * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast + * context * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * @@ -141,6 +201,11 @@ struct net_bridge_vlan { struct br_tunnel_info tinfo; + union { + struct net_bridge_mcast br_mcast_ctx; + struct net_bridge_mcast_port port_mcast_ctx; + }; + struct list_head vlist; struct rcu_head rcu; @@ -305,19 +370,13 @@ struct net_bridge_port { struct kobject kobj; struct rcu_head rcu; + struct net_bridge_mcast_port multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - struct bridge_mcast_own_query ip4_own_query; - struct timer_list ip4_mc_router_timer; - struct hlist_node ip4_rlist; -#if IS_ENABLED(CONFIG_IPV6) - struct bridge_mcast_own_query ip6_own_query; - struct timer_list ip6_mc_router_timer; - struct hlist_node ip6_rlist; -#endif /* IS_ENABLED(CONFIG_IPV6) */ + struct bridge_mcast_stats __percpu *mcast_stats; + u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; - unsigned char multicast_router; - struct bridge_mcast_stats __percpu *mcast_stats; struct hlist_head mglist; #endif @@ -329,7 +388,12 @@ struct net_bridge_port { struct netpoll *np; #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Identifier used to group ports that share the same switchdev + * hardware domain. 
+ */ + int hwdom; + int offload_count; + struct netdev_phys_item_id ppid; #endif u16 group_fwd_mask; u16 backup_redirected_cnt; @@ -376,6 +440,7 @@ enum net_bridge_opts { BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, + BROPT_MCAST_VLAN_SNOOPING_ENABLED, }; struct net_bridge { @@ -426,25 +491,14 @@ struct net_bridge { BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; + struct net_bridge_mcast multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct bridge_mcast_stats __percpu *mcast_stats; u32 hash_max; - u32 multicast_last_member_count; - u32 multicast_startup_query_count; - - u8 multicast_igmp_version; - u8 multicast_router; -#if IS_ENABLED(CONFIG_IPV6) - u8 multicast_mld_version; -#endif spinlock_t multicast_lock; - unsigned long multicast_last_member_interval; - unsigned long multicast_membership_interval; - unsigned long multicast_querier_interval; - unsigned long multicast_query_interval; - unsigned long multicast_query_response_interval; - unsigned long multicast_startup_query_interval; struct rhashtable mdb_hash_tbl; struct rhashtable sg_port_tbl; @@ -452,19 +506,6 @@ struct net_bridge { struct hlist_head mcast_gc_list; struct hlist_head mdb_list; - struct hlist_head ip4_mc_router_list; - struct timer_list ip4_mc_router_timer; - struct bridge_mcast_other_query ip4_other_query; - struct bridge_mcast_own_query ip4_own_query; - struct bridge_mcast_querier ip4_querier; - struct bridge_mcast_stats __percpu *mcast_stats; -#if IS_ENABLED(CONFIG_IPV6) - struct hlist_head ip6_mc_router_list; - struct timer_list ip6_mc_router_timer; - struct bridge_mcast_other_query ip6_other_query; - struct bridge_mcast_own_query ip6_own_query; - struct bridge_mcast_querier ip6_querier; -#endif /* IS_ENABLED(CONFIG_IPV6) */ struct work_struct mcast_gc_work; #endif @@ -476,7 +517,12 @@ struct net_bridge { u32 auto_cnt; #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Counter used to make sure that hardware domains get unique + * identifiers in case a bridge spans multiple switchdev instances. + */ + int last_hwdom; + /* Bit mask of hardware domain numbers in use */ + unsigned long busy_hwdoms; #endif struct hlist_head fdb_list; @@ -506,7 +552,20 @@ struct br_input_skb_cb { #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Set if TX data plane offloading is used towards at least one + * hardware domain. + */ + u8 tx_fwd_offload:1; + /* The switchdev hardware domain from which this packet was received. + * If skb->offload_fwd_mark was set, then this packet was already + * forwarded by hardware to the other ports in the source hardware + * domain, otherwise it wasn't. + */ + int src_hwdom; + /* Bit mask of hardware domains towards which this packet has already been + * transmitted using the TX data plane offload. 
+ */ + unsigned long fwd_hwdoms; #endif }; @@ -718,6 +777,8 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded); +int br_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, + struct notifier_block *nb); /* br_forward.c */ enum br_pkt_type { @@ -790,15 +851,18 @@ br_port_get_check_rtnl(const struct net_device *dev) } /* br_ioctl.c */ -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, - void __user *arg); +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, + void __user *data, int cmd); +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); @@ -810,8 +874,9 @@ void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); -void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, bool local_rcv, bool local_orig); +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, + bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val, @@ -835,12 +900,13 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp); -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_uninit_stats(struct net_bridge *br); @@ -849,7 +915,8 @@ void br_multicast_get_stats(const struct net_bridge *br, struct br_mcast_stats *dest); void br_mdb_init(void); void br_mdb_uninit(void); -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); 
@@ -859,6 +926,23 @@ struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx); +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx); +void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx); +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); +void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on); +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); + +int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -869,52 +953,65 @@ static inline bool br_group_is_l2(const struct br_ip *group) rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) static inline struct hlist_node * -br_multicast_get_first_rport_node(struct net_bridge *b, struct sk_buff *skb) { +br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx, + struct sk_buff *skb) +{ #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) - return rcu_dereference(hlist_first_rcu(&b->ip6_mc_router_list)); + return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list)); #endif - return rcu_dereference(hlist_first_rcu(&b->ip4_mc_router_list)); + return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); } static inline struct net_bridge_port * -br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { +br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) +{ + struct net_bridge_mcast_port *mctx; + #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) - return hlist_entry_safe(rp, struct net_bridge_port, ip6_rlist); + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip6_rlist); + else #endif - return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip4_rlist); + + if (mctx) + return mctx->port; + else + return NULL; } -static inline bool br_ip4_multicast_is_router(struct net_bridge *br) +static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) { - return timer_pending(&br->ip4_mc_router_timer); + return timer_pending(&brmctx->ip4_mc_router_timer); } -static inline bool br_ip6_multicast_is_router(struct net_bridge *br) +static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) - return timer_pending(&br->ip6_mc_router_timer); + return timer_pending(&brmctx->ip6_mc_router_timer); #else return false; #endif } static inline bool -br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) +br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { - switch (br->multicast_router) { + switch (brmctx->multicast_router) { case MDB_RTR_TYPE_PERM: return true; case MDB_RTR_TYPE_TEMP_QUERY: if (skb) { if (skb->protocol == htons(ETH_P_IP)) - return br_ip4_multicast_is_router(br); + return br_ip4_multicast_is_router(brmctx); 
else if (skb->protocol == htons(ETH_P_IPV6)) - return br_ip6_multicast_is_router(br); + return br_ip6_multicast_is_router(brmctx); } else { - return br_ip4_multicast_is_router(br) || - br_ip6_multicast_is_router(br); + return br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx); } fallthrough; default: @@ -923,14 +1020,14 @@ br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) } static inline bool -__br_multicast_querier_exists(struct net_bridge *br, - struct bridge_mcast_other_query *querier, - const bool is_ipv6) +__br_multicast_querier_exists(struct net_bridge_mcast *brmctx, + struct bridge_mcast_other_query *querier, + const bool is_ipv6) { bool own_querier_enabled; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR)) + if (br_opt_get(brmctx->br, BROPT_MULTICAST_QUERIER)) { + if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else own_querier_enabled = true; @@ -942,18 +1039,18 @@ __br_multicast_querier_exists(struct net_bridge *br, (own_querier_enabled || timer_pending(&querier->timer)); } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): - return __br_multicast_querier_exists(br, - &br->ip4_other_query, false); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip4_other_query, false); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): - return __br_multicast_querier_exists(br, - &br->ip6_other_query, true); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip6_other_query, true); #endif default: return !!mdb && br_group_is_l2(&mdb->addr); @@ -974,15 +1071,16 @@ static inline bool br_multicast_is_star_g(const struct br_ip *ip) } } -static inline bool br_multicast_should_handle_mode(const struct net_bridge *br, - __be16 proto) +static inline bool +br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx, + __be16 proto) { switch (proto) { case htons(ETH_P_IP): - return !!(br->multicast_igmp_version == 3); + return !!(brmctx->multicast_igmp_version == 3); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return !!(br->multicast_mld_version == 2); + return !!(brmctx->multicast_mld_version == 2); #endif default: return false; @@ -994,28 +1092,90 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) return BR_INPUT_SKB_CB(skb)->igmp; } -static inline unsigned long br_multicast_lmqt(const struct net_bridge *br) +static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) { - return br->multicast_last_member_interval * - br->multicast_last_member_count; + return brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count; } -static inline unsigned long br_multicast_gmi(const struct net_bridge *br) +static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { /* use the RFC default of 2 for QRV */ - return 2 * br->multicast_query_interval + - br->multicast_query_response_interval; + return 2 * brmctx->multicast_query_interval + + brmctx->multicast_query_response_interval; +} + +static inline bool +br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx) +{ + return !!brmctx->vlan; +} + +static inline bool +br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx) +{ + return !!pmctx->vlan; +} + +static inline struct 
net_bridge_mcast * +br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) +{ + if (!br_multicast_port_ctx_is_vlan(pmctx)) + return &pmctx->port->br->multicast_ctx; + else + return &pmctx->vlan->brvlan->br_mcast_ctx; +} + +static inline bool +br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED); +} + +static inline bool +br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return pmctx->port->state == BR_STATE_DISABLED || + (br_multicast_port_ctx_is_vlan(pmctx) && + (br_multicast_port_ctx_vlan_disabled(pmctx) || + pmctx->vlan->state == BR_STATE_DISABLED)); +} + +static inline bool +br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_port_ctx_state_disabled(pmctx) || + pmctx->port->state == BR_STATE_BLOCKING || + (br_multicast_port_ctx_is_vlan(pmctx) && + pmctx->vlan->state == BR_STATE_BLOCKING); } #else -static inline int br_multicast_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { return 0; } -static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { return NULL; @@ -1064,17 +1224,18 @@ static inline void br_multicast_dev_del(struct net_bridge *br) static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { } -static inline bool br_multicast_is_router(struct net_bridge *br, +static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { return false; } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { @@ -1118,13 +1279,65 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return 0; } + +static inline void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, + bool on) +{ +} + +static inline void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, + bool on) +{ +} + +static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, + bool on, + struct netlink_ext_ack 
*extack) +{ + return -EOPNOTSUPP; +} + +static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, + bool on) +{ + return false; +} + +static inline int br_mdb_replay(struct net_device *br_dev, + struct net_device *dev, const void *ctx, + bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state); + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -1168,6 +1381,9 @@ void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd); +int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack); bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); @@ -1236,8 +1452,11 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) + { + *vlan = NULL; return true; } @@ -1410,6 +1629,14 @@ static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, { return true; } + +static inline int br_vlan_replay(struct net_device *br_dev, + struct net_device *dev, const void *ctx, + bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif /* br_vlan_options.c */ @@ -1424,6 +1651,14 @@ int br_vlan_process_options(const struct net_bridge *br, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack); +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end); +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) @@ -1645,7 +1880,14 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; } /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV -int nbp_switchdev_mark_set(struct net_bridge_port *p); +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); + +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); + +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb); +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, @@ -1659,15 +1901,32 @@ void br_switchdev_fdb_notify(struct net_bridge *br, int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); +void br_switchdev_init(struct 
net_bridge *br); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { skb->offload_fwd_mark = 0; } #else -static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) +{ + return false; +} + +static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) { - return 0; } static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, @@ -1710,6 +1969,11 @@ br_switchdev_fdb_notify(struct net_bridge *br, static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { } + +static inline void br_switchdev_init(struct net_bridge *br) +{ +} + #endif /* CONFIG_NET_SWITCHDEV */ /* br_arp_nd_proxy.c */ diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h index f89049f4892c..adf82a05515a 100644 --- a/net/bridge/br_private_mcast_eht.h +++ b/net/bridge/br_private_mcast_eht.h @@ -51,7 +51,8 @@ struct net_bridge_group_eht_set { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg); -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index d3adee0f91f9..023de0e958f1 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -8,50 +8,65 @@ #include "br_private.h" -static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) -{ - struct net_bridge_port *p; +static struct static_key_false br_switchdev_tx_fwd_offload; - /* dev is yet to be added to the port list. 
*/ - list_for_each_entry(p, &br->port_list, list) { - if (netdev_port_same_parent_id(dev, p->dev)) - return p->offload_fwd_mark; - } +static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - return ++br->offload_fwd_mark; + return (p->flags & BR_TX_FWD_OFFLOAD) && + (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } -int nbp_switchdev_mark_set(struct net_bridge_port *p) +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { - struct netdev_phys_item_id ppid = { }; - int err; + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - ASSERT_RTNL(); + return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; +} - err = dev_get_port_parent_id(p->dev, &ppid, true); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ + skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); +} - p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); +/* Mark the frame for TX forwarding offload if this egress port supports it */ +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; +} - return 0; +/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms + * that the skb has been already forwarded to, to avoid further cloning to + * other ports in the same hwdom by making nbp_switchdev_allowed_egress() + * return false. + */ +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { - if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) - BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; + if (p->hwdom) + BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { - return !skb->offload_fwd_mark || - BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; + struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); + + return !test_bit(p->hwdom, &cb->fwd_hwdoms) && + (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ @@ -112,7 +127,6 @@ br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { const struct net_bridge_port *dst = READ_ONCE(fdb->dst); - struct net_device *dev = dst ? dst->dev : br->dev; struct switchdev_notifier_fdb_info info = { .addr = fdb->key.addr.addr, .vid = fdb->key.vlan_id, @@ -120,6 +134,7 @@ br_switchdev_fdb_notify(struct net_bridge *br, .is_local = test_bit(BR_FDB_LOCAL, &fdb->flags), .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), }; + struct net_device *dev = info.is_local ? br->dev : dst->dev; switch (type) { case RTM_DELNEIGH: @@ -156,3 +171,200 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) return switchdev_port_obj_del(dev, &v.obj); } + +static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) +{ + struct net_bridge *br = joining->br; + struct net_bridge_port *p; + int hwdom; + + /* joining is yet to be added to the port list. 
*/ + list_for_each_entry(p, &br->port_list, list) { + if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { + joining->hwdom = p->hwdom; + return 0; + } + } + + hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); + if (hwdom >= BR_HWDOM_MAX) + return -EBUSY; + + set_bit(hwdom, &br->busy_hwdoms); + joining->hwdom = hwdom; + return 0; +} + +static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) +{ + struct net_bridge *br = leaving->br; + struct net_bridge_port *p; + + /* leaving is no longer in the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (p->hwdom == leaving->hwdom) + return; + } + + clear_bit(leaving->hwdom, &br->busy_hwdoms); +} + +static int nbp_switchdev_add(struct net_bridge_port *p, + struct netdev_phys_item_id ppid, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + int err; + + if (p->offload_count) { + /* Prevent unsupported configurations such as a bridge port + * which is a bonding interface, and the member ports are from + * different hardware switches. + */ + if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { + NL_SET_ERR_MSG_MOD(extack, + "Same bridge port cannot be offloaded by two physical switches"); + return -EBUSY; + } + + /* Tolerate drivers that call switchdev_bridge_port_offload() + * more than once for the same bridge port, such as when the + * bridge port is an offloaded bonding/team interface. + */ + p->offload_count++; + + return 0; + } + + p->ppid = ppid; + p->offload_count = 1; + + err = nbp_switchdev_hwdom_set(p); + if (err) + return err; + + if (tx_fwd_offload) { + p->flags |= BR_TX_FWD_OFFLOAD; + static_branch_inc(&br_switchdev_tx_fwd_offload); + } + + return 0; +} + +static void nbp_switchdev_del(struct net_bridge_port *p) +{ + if (WARN_ON(!p->offload_count)) + return; + + p->offload_count--; + + if (p->offload_count) + return; + + if (p->hwdom) + nbp_switchdev_hwdom_put(p); + + if (p->flags & BR_TX_FWD_OFFLOAD) { + p->flags &= ~BR_TX_FWD_OFFLOAD; + static_branch_dec(&br_switchdev_tx_fwd_offload); + } +} + +static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + int err; + + err = br_vlan_replay(br_dev, dev, ctx, true, blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_fdb_replay(br_dev, ctx, true, atomic_nb); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + + br_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_fdb_replay(br_dev, ctx, false, atomic_nb); +} + +/* Let the bridge know that this port is offloaded, so that it can assign a + * switchdev hardware domain to it. 
+ */ +int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + struct netdev_phys_item_id ppid; + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(brport_dev); + if (!p) + return -ENODEV; + + err = dev_get_port_parent_id(dev, &ppid, false); + if (err) + return err; + + err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); + if (err) + return err; + + err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); + if (err) + goto out_switchdev_del; + + return 0; + +out_switchdev_del: + nbp_switchdev_del(p); + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload); + +void switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(brport_dev); + if (!p) + return; + + nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb); + + nbp_switchdev_del(p); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 381467b691d5..953d544663d5 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -384,7 +384,7 @@ static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_router); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, @@ -514,7 +514,7 @@ static ssize_t multicast_igmp_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_igmp_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, @@ -536,13 +536,13 @@ static ssize_t multicast_last_member_count_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_last_member_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; return 0; } @@ -558,13 +558,13 @@ static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_startup_query_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; return 0; } @@ -581,13 +581,13 @@ static ssize_t multicast_last_member_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_last_member_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - 
br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } @@ -604,13 +604,13 @@ static ssize_t multicast_membership_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_membership_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } @@ -628,13 +628,13 @@ static ssize_t multicast_querier_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_querier_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } @@ -652,13 +652,13 @@ static ssize_t multicast_query_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); return 0; } @@ -676,13 +676,13 @@ static ssize_t multicast_query_response_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_response_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } @@ -700,13 +700,13 @@ static ssize_t multicast_startup_query_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_startup_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); return 0; } @@ -751,7 +751,7 @@ static ssize_t multicast_mld_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_mld_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 72e92376eef1..e9e3aedd3178 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -244,7 +244,7 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { - return 
sprintf(buf, "%d\n", p->multicast_router); + return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index a08e9f193009..8cfd035bbaf9 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -190,6 +190,8 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); + br_multicast_toggle_one_vlan(masterv, false); + br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } @@ -280,10 +282,13 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } else { v->stats = masterv->stats; } + br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err && err != -EOPNOTSUPP) goto out; + br_multicast_ctx_init(br, v, &v->br_mcast_ctx); + v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ @@ -306,6 +311,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, __vlan_add_list(v); __vlan_add_flags(v, flags); + br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); @@ -374,6 +380,8 @@ static int __vlan_del(struct net_bridge_vlan *v) br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); + br_multicast_toggle_one_vlan(v, false); + br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } @@ -457,7 +465,15 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, u64_stats_update_end(&stats->syncp); } - if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) + /* If the skb will be sent using forwarding offload, the assumption is + * that the switchdev will inject the packet into hardware together + * with the bridge VLAN, so that it can be forwarded according to that + * VLAN. The switchdev should deal with popping the VLAN header in + * hardware on each egress port as appropriate. So only strip the VLAN + * header if forwarding offload is not being used. 
+ */ + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && + !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && @@ -473,7 +489,8 @@ out: static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, - u8 *state) + u8 *state, + struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; @@ -538,8 +555,9 @@ static bool __allowed_ingress(const struct net_bridge *br, */ skb->vlan_tci |= pvid; - /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + /* if snooping and stats are disabled we can avoid the lookup */ + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); return br_vlan_state_allowed(*state, true); @@ -566,6 +584,8 @@ static bool __allowed_ingress(const struct net_bridge *br, u64_stats_update_end(&stats->syncp); } + *vlan = v; + return true; drop: @@ -575,17 +595,19 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ + *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } - return __allowed_ingress(br, vg, skb, vid, state); + return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ @@ -818,14 +840,21 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; + br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); + err = switchdev_port_attr_set(br->dev, &attr, extack); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; + } - br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); + if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); + br_multicast_toggle_vlan_snooping(br, false, NULL); + } return 0; } @@ -1420,6 +1449,33 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, } EXPORT_SYMBOL_GPL(br_vlan_get_info); +int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + + p = br_port_get_check_rcu(dev); + if (p) + vg = nbp_vlan_group_rcu(p); + else if (netif_is_bridge_master(dev)) + vg = br_vlan_group_rcu(netdev_priv(dev)); + else + return -EINVAL; + + v = br_vlan_find(vg, vid); + if (!v) + return -ENOENT; + + p_vinfo->vid = vid; + p_vinfo->flags = v->flags; + if (vid == br_get_pvid(vg)) + p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); + static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && @@ -1838,6 +1894,9 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, ASSERT_RTNL(); + if (!nb) + return 0; + if (!netif_is_bridge_master(br_dev)) return -EINVAL; @@ -1884,7 +1943,6 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, return err; } 
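Note that br_vlan_replay() now bails out early when no notifier block is passed, and its EXPORT_SYMBOL_GPL is dropped just below: drivers no longer invoke the replay helpers themselves, the bridge runs them on their behalf from switchdev_bridge_port_offload(). A minimal sketch of the driver side under that model; the foo_* names and notifier blocks are invented for illustration, only the two switchdev_bridge_port_* calls and their signatures come from this patch:

#include <linux/if_bridge.h>
#include <linux/netdevice.h>

/* Assumed to be the driver's existing switchdev notifier blocks. */
extern struct notifier_block foo_switchdev_nb;
extern struct notifier_block foo_switchdev_blocking_nb;

static int foo_port_bridge_join(struct net_device *dev,
				struct netlink_ext_ack *extack)
{
	void *priv = netdev_priv(dev);

	/* Marks the bridge port as offloaded, assigns it a hardware
	 * domain and replays VLANs, MDB and FDB entries through the two
	 * notifier blocks; tx_fwd_offload=true also opts in to TX
	 * forwarding offload.
	 */
	return switchdev_bridge_port_offload(dev, dev, priv,
					     &foo_switchdev_nb,
					     &foo_switchdev_blocking_nb,
					     true, extack);
}

static void foo_port_bridge_leave(struct net_device *dev)
{
	switchdev_bridge_port_unoffload(dev, netdev_priv(dev),
					&foo_switchdev_nb,
					&foo_switchdev_blocking_nb);
}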
-EXPORT_SYMBOL_GPL(br_vlan_replay); /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, @@ -1901,6 +1959,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; @@ -1919,6 +1978,10 @@ static int br_vlan_dump_dev(const struct net_device *dev, vg = br_vlan_group_rcu(br); p = NULL; } else { + /* global options are dumped only for bridge devices */ + if (dump_global) + return 0; + p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; @@ -1941,7 +2004,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { - if (!br_vlan_should_use(v)) + if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; @@ -1954,8 +2017,21 @@ static int br_vlan_dump_dev(const struct net_device *dev, continue; } - if (dump_stats || v->vid == pvid || - !br_vlan_can_enter_range(v, range_end)) { + if (dump_global) { + if (br_vlan_global_opts_can_enter_range(v, range_end)) + continue; + if (!br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, + range_start)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } else if (dump_stats || v->vid == pvid || + !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, @@ -1977,11 +2053,18 @@ static int br_vlan_dump_dev(const struct net_device *dev, * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ - if (!err && range_start && - !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - range_start, br_vlan_flags(range_start, pvid), - dump_stats)) - err = -EMSGSIZE; + if (!err && range_start) { + if (dump_global && + !br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, range_start)) + err = -EMSGSIZE; + else if (!dump_global && + !br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + br_vlan_flags(range_start, pvid), + dump_stats)) + err = -EMSGSIZE; + } cb->args[1] = err ? 
idx : 0; @@ -2185,12 +2268,22 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { - if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + switch (nla_type(attr)) { + case BRIDGE_VLANDB_ENTRY: + err = br_vlan_rtm_process_one(dev, attr, + nlh->nlmsg_type, + extack); + break; + case BRIDGE_VLANDB_GLOBAL_OPTIONS: + err = br_vlan_rtm_process_global_options(dev, attr, + nlh->nlmsg_type, + extack); + break; + default: continue; + } vlans++; - err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, - extack); if (err) break; } diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index b4add9ea8964..4ef975b20185 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -258,3 +258,219 @@ int br_vlan_process_options(const struct net_bridge *br, return err; } + +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end) +{ + return v_curr->vid - r_end->vid == 1 && + ((v_curr->priv_flags ^ r_end->priv_flags) & + BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0; +} + +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); + if (!nest) + return false; + + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) + goto out_err; + + if (vid_range && vid < vid_range && + nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) + goto out_err; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))) + goto out_err; +#endif + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static size_t rtnl_vlan_global_opts_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ +#endif + + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ +} + +static void br_vlan_global_opts_notify(const struct net_bridge *br, + u16 vid, u16 vid_range) +{ + struct net_bridge_vlan *v; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = br->dev->ifindex; + + /* need to find the vlan due to flags/options */ + v = br_vlan_find(br_vlan_group(br), vid); + if (!v) + goto out_kfree; + + if (!br_vlan_global_opts_fill(skb, vid, vid_range, v)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); +out_kfree: + kfree_skb(skb); +} + +static int br_vlan_process_global_one_opts(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + *changed = false; +#ifdef 
CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { + u8 mc_snooping; + + mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]); + if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) + *changed = true; + } +#endif + + return 0; +} + +static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { + [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, +}; + +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; + struct net_bridge_vlan_group *vg; + u16 vid, vid_range = 0; + struct net_bridge *br; + int err = 0; + + if (cmd != RTM_NEWVLAN) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); + return -EINVAL; + } + if (!netif_is_bridge_master(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); + return -EINVAL; + } + br = netdev_priv(dev); + vg = br_vlan_group(br); + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, + br_vlan_db_gpol, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); + return -EINVAL; + } + vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); + if (!br_vlan_valid_id(vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { + vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); + if (!br_vlan_valid_id(vid_range, extack)) + return -EINVAL; + if (vid >= vid_range) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return -EINVAL; + } + } else { + vid_range = vid; + } + + for (; vid <= vid_range; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (!br_vlan_global_opts_can_enter_range(v, curr_end)) { + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); + + return err; +} diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 54f6d521492f..b904c06ab0cf 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -352,7 +352,7 @@ static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); - /* This function will be call by the generic networking code, when then + /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). 
* * The race between diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index bdc95bd7a851..dac70cdd3f41 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -776,7 +776,7 @@ static int j1939_session_tx_dpo(struct j1939_session *session) static int j1939_session_tx_dat(struct j1939_session *session) { struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *se_skcb; int offset, pkt_done, pkt_end; unsigned int len, pdelay; struct sk_buff *se_skb; @@ -788,7 +788,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (!se_skb) return -ENOBUFS; - skcb = j1939_skb_to_cb(se_skb); + se_skcb = j1939_skb_to_cb(se_skb); tpdat = se_skb->data; ret = 0; pkt_done = 0; @@ -800,7 +800,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) while (session->pkt.tx < pkt_end) { dat[0] = session->pkt.tx - session->pkt.dpo + 1; - offset = (session->pkt.tx * 7) - skcb->offset; + offset = (session->pkt.tx * 7) - se_skcb->offset; len = se_skb->len - offset; if (len > 7) len = 7; @@ -808,7 +808,8 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (offset + len > se_skb->len) { netdev_err_once(priv->ndev, "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", - __func__, session, skcb->offset, se_skb->len , session->pkt.tx); + __func__, session, se_skcb->offset, + se_skb->len , session->pkt.tx); ret = -EOVERFLOW; goto out_free; } @@ -1097,7 +1098,7 @@ j1939_session_deactivate_activate_next(struct j1939_session *session) } static void __j1939_session_cancel(struct j1939_session *session, - enum j1939_xtp_abort err) + enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; @@ -1195,13 +1196,13 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) static void j1939_session_completed(struct j1939_session *session) { - struct sk_buff *skb; + struct sk_buff *se_skb; if (!session->transmission) { - skb = j1939_session_skb_get(session); + se_skb = j1939_session_skb_get(session); /* distribute among j1939 receivers */ - j1939_sk_recv(session->priv, skb); - consume_skb(skb); + j1939_sk_recv(session->priv, se_skb); + consume_skb(se_skb); } j1939_session_deactivate_activate_next(session); @@ -1268,12 +1269,14 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_RTS: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: abort = J1939_XTP_ABORT_BUSY; break; case J1939_ETP_CMD_CTS: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; break; @@ -1282,7 +1285,8 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_EOMA: - case J1939_TP_CMD_EOMA: /* fall through */ + fallthrough; + case J1939_TP_CMD_EOMA: abort = J1939_XTP_ABORT_OTHER; break; @@ -1772,7 +1776,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; const u8 *dat; u8 *tpdat; @@ -1797,7 +1801,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, break; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: if (skcb->addr.type != J1939_ETP) break; fallthrough; @@ -1822,8 +1827,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session 
*session, goto out_session_cancel; } - skcb = j1939_skb_to_cb(se_skb); - offset = packet * 7 - skcb->offset; + se_skcb = j1939_skb_to_cb(se_skb); + offset = packet * 7 - se_skcb->offset; nbytes = se_skb->len - offset; if (nbytes > 7) nbytes = 7; @@ -1851,7 +1856,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (packet == session->pkt.rx) session->pkt.rx++; - if (skcb->addr.type != J1939_ETP && + if (se_skcb->addr.type != J1939_ETP && j1939_cb_is_broadcast(&session->skcb)) { if (session->pkt.rx >= session->pkt.total) final = true; @@ -2000,7 +2005,8 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) return; diff --git a/net/core/Makefile b/net/core/Makefile index f7f16650fe9e..35ced6201814 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o -ifeq ($(CONFIG_INET),y) obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o -endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/dev.c b/net/core/dev.c index 8f1a47ad6781..64e1a5f63f93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4012,7 +4012,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -4756,45 +4756,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - u32 metalen, act = XDP_DROP; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; + u32 metalen, act; int off; - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_is_redirected(skb)) - return XDP_PASS; - - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. - */ - if (skb_cloned(skb) || skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) - goto do_drop; - } - /* The XDP program wants to see the packet starting at the MAC * header. 
*/ @@ -4849,6 +4822,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, skb->dev); } + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -4859,6 +4839,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (metalen) skb_metadata_set(skb, metalen); break; + } + + return act; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (skb_linearize(skb)) + goto do_drop; + } + + act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; default: bpf_warn_invalid_xdp_action(act); fallthrough; @@ -5141,8 +5164,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, - &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -5324,7 +5346,6 @@ another_round: ret = NET_RX_DROP; goto out; } - skb_reset_mac_len(skb); } if (eth_type_vlan(skb->protocol)) { @@ -5650,25 +5671,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; - if (new) { - u32 i; - - mutex_lock(&new->aux->used_maps_mutex); - - /* generic XDP does not work with DEVMAPs that can - * have a bpf_prog installed on an entry - */ - for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i]) || - cpu_map_prog_allowed(new->aux->used_maps[i])) { - mutex_unlock(&new->aux->used_maps_mutex); - return -EINVAL; - } - } - - mutex_unlock(&new->aux->used_maps_mutex); - } - switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -6011,7 +6013,6 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), @@ -6021,17 +6022,30 @@ static void gro_list_prepare(const struct list_head 
*head, skb_mac_header(skb), maclen); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { #if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (!diffs) { - struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT); - struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT); + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; - } #endif + } NAPI_GRO_CB(p)->same_flow = !diffs; } @@ -6296,8 +6310,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - skb_ext_reset(skb); - nf_reset_ct(skb); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } napi->skb = skb; } @@ -10134,7 +10152,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -10201,7 +10219,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -10841,7 +10859,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 478d032f34ac..4035bce06bf8 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <linux/if_bridge.h> #include <net/dsa.h> #include <net/wext.h> @@ -25,79 +27,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr) return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); - /* * Perform a SIOCGIFCONF call. 
This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ - -int dev_ifconf(struct net *net, struct ifconf *ifc, int size) +int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + void __user *pos; + size_t size; + int len, total = 0, done; - /* - * Fetch the caller's info block. - */ + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; - pos = ifc->ifc_buf; - len = ifc->ifc_len; + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; - /* - * Loop over the interfaces, and write an info block for each. - */ + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; - total = 0; + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } + + /* Loop over the interfaces, and write an info block for each. */ + rtnl_lock(); for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0, size); - else - done = gifconf_list[i](dev, pos + total, - len - total, size); - if (done < 0) - return -EFAULT; - total += done; - } + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_unlock(); + return -EFAULT; } + total += done; } + rtnl_unlock(); - /* - * All done. Write the updated control block back to the caller. - */ - ifc->ifc_len = total; + return put_user(total, &uifc->ifc_len); +} + +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; - /* - * Both BSD and Solaris return 0 here, so we do too. 
- */ return 0; } +static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); +} + /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ @@ -128,13 +159,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm break; case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; @@ -215,19 +240,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } -static int dev_do_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) +static int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; int err; - err = dsa_ndo_do_ioctl(dev, ifr, cmd); + err = dsa_ndo_eth_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) return err; - if (ops->ndo_do_ioctl) { + if (ops->ndo_eth_ioctl) { if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); + err = ops->ndo_eth_ioctl(dev, ifr, cmd); else err = -ENODEV; } @@ -235,10 +260,55 @@ static int dev_do_ioctl(struct net_device *dev, return err; } +static int dev_siocbond(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + if (netif_device_present(dev)) + return ops->ndo_siocbond(dev, ifr, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + if (netif_device_present(dev)) + return ops->ndo_siocdevprivate(dev, ifr, data, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + if (netif_device_present(dev)) + return ops->ndo_siocwandev(dev, ifs); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); @@ -275,12 +345,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + return dev_setifmap(dev, ifr); case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -307,6 +372,15 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 
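With this split, a driver that only services MII or hardware-timestamping requests implements ndo_eth_ioctl rather than the old catch-all ndo_do_ioctl. A minimal sketch of such a handler for a phylib-based driver; foo_* is an invented example, and phy_mii_ioctl() is assumed to be available as in mainline:

#include <linux/netdevice.h>
#include <linux/phy.h>

static int foo_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	/* dev_eth_ioctl() has already checked netif_device_present();
	 * only SIOCGMIIPHY/SIOCGMIIREG/SIOCSMIIREG and the hwtstamp
	 * ioctls reach this hook now, so forwarding to the PHY suffices.
	 */
	if (!dev->phydev)
		return -ENODEV;

	return phy_mii_ioctl(dev->phydev, ifr, cmd);
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_eth_ioctl = foo_eth_ioctl,
	/* other ndo_* callbacks elided */
};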
ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!netif_device_present(dev)) + return -ENODEV; + return br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); + case SIOCSHWTSTAMP: err = net_hwtstamp_validate(ifr); if (err) @@ -317,23 +391,23 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) * Unknown or private ioctl */ default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) + return dev_siocdevprivate(dev, ifr, data, cmd); + + if (cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP) { + err = dev_eth_ioctl(dev, ifr, cmd); + } else if (cmd == SIOCBONDENSLAVE || cmd == SIOCBONDRELEASE || cmd == SIOCBONDSETHWADDR || cmd == SIOCBONDSLAVEINFOQUERY || cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCGHWTSTAMP || - cmd == SIOCWANDEV) { - err = dev_do_ioctl(dev, ifr, cmd); + cmd == SIOCBONDCHANGEACTIVE) { + err = dev_siocbond(dev, ifr, cmd); } else err = -EINVAL; @@ -386,7 +460,8 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) { int ret; char *colon; @@ -437,7 +512,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCETHTOOL: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ethtool(net, ifr); + ret = dev_ethtool(net, ifr, data); rtnl_unlock(); if (colon) *colon = ':'; @@ -456,7 +531,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (colon) *colon = ':'; @@ -502,7 +577,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (need_copyout) *need_copyout = false; @@ -527,7 +602,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); return ret; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 85032626de24..8fa015319af6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -108,19 +108,6 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); -static void __devlink_net_set(struct devlink *devlink, struct net *net) -{ - write_pnet(&devlink->_net, net); -} - -void devlink_net_set(struct devlink *devlink, struct net *net) -{ - if (WARN_ON(devlink->registered)) - return; - __devlink_net_set(devlink, net); -} -EXPORT_SYMBOL_GPL(devlink_net_set); - static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -1043,7 +1030,7 @@ static void 
devlink_port_notify(struct devlink_port *devlink_port, struct sk_buff *msg; int err; - if (!devlink_port->registered) + if (!devlink_port->devlink) return; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); @@ -3801,10 +3788,12 @@ static void devlink_param_notify(struct devlink *devlink, struct devlink_param_item *param_item, enum devlink_command cmd); -static void devlink_reload_netns_change(struct devlink *devlink, - struct net *dest_net) +static void devlink_ns_change_notify(struct devlink *devlink, + struct net *dest_net, struct net *curr_net, + bool new) { struct devlink_param_item *param_item; + enum devlink_command cmd; /* Userspace needs to be notified about devlink objects * removed from original and entering new network namespace. @@ -3812,17 +3801,18 @@ static void devlink_reload_netns_change(struct devlink *devlink, * reload process so the notifications are generated separately. */ - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - devlink_notify(devlink, DEVLINK_CMD_DEL); + if (!dest_net || net_eq(dest_net, curr_net)) + return; - __devlink_net_set(devlink, dest_net); + if (new) + devlink_notify(devlink, DEVLINK_CMD_NEW); - devlink_notify(devlink, DEVLINK_CMD_NEW); + cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, cmd); + + if (!new) + devlink_notify(devlink, DEVLINK_CMD_DEL); } static bool devlink_reload_supported(const struct devlink_ops *ops) @@ -3902,6 +3892,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; int err; if (!devlink->reload_enabled) @@ -3909,18 +3900,22 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); + + curr_net = devlink_net(devlink); + devlink_ns_change_notify(devlink, dest_net, curr_net, false); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; - if (dest_net && !net_eq(dest_net, devlink_net(devlink))) - devlink_reload_netns_change(devlink, dest_net); + if (dest_net && !net_eq(dest_net, curr_net)) + write_pnet(&devlink->_net, dest_net); err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; + devlink_ns_change_notify(devlink, dest_net, curr_net, true); WARN_ON(!(*actions_performed & BIT(action))); /* Catch driver on updating the remote action within devlink reload */ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, @@ -8768,15 +8763,18 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) } /** - * devlink_alloc - Allocate new devlink instance resources + * devlink_alloc_ns - Allocate new devlink instance resources + * in a specific namespace * * @ops: ops * @priv_size: size of user private data + * @net: net namespace * * Allocate new devlink instance resources, including devlink index * and name.
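With devlink_net_set() gone, the instance's namespace is now fixed at allocation time via devlink_alloc_ns() below. The companion header change is not part of this excerpt, but presumably (as in the mainline series) plain devlink_alloc() survives as a wrapper pinning the instance to init_net — a sketch:

/* Sketch of the assumed include/net/devlink.h counterpart. */
struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
				 size_t priv_size, struct net *net);

static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
					    size_t priv_size)
{
	return devlink_alloc_ns(ops, priv_size, &init_net);
}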
*/ -struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net) { struct devlink *devlink; @@ -8791,7 +8789,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) return NULL; devlink->ops = ops; xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - __devlink_net_set(devlink, &init_net); + write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); @@ -8807,7 +8805,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) mutex_init(&devlink->reporters_lock); return devlink; } -EXPORT_SYMBOL_GPL(devlink_alloc); +EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_register - Register devlink instance @@ -8817,8 +8815,8 @@ EXPORT_SYMBOL_GPL(devlink_alloc); */ int devlink_register(struct devlink *devlink, struct device *dev) { + WARN_ON(devlink->dev); devlink->dev = dev; - devlink->registered = true; mutex_lock(&devlink_mutex); list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); @@ -8960,9 +8958,10 @@ int devlink_port_register(struct devlink *devlink, mutex_unlock(&devlink->lock); return -EEXIST; } + + WARN_ON(devlink_port->devlink); devlink_port->devlink = devlink; devlink_port->index = port_index; - devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); @@ -9001,7 +9000,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->registered)) + if (WARN_ON(!devlink_port->devlink)) return; devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); @@ -9121,7 +9120,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); @@ -9145,7 +9144,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); @@ -9172,7 +9171,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); @@ -9200,7 +9199,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a9f937975080..79df7cd9dbc1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -57,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; @@ -541,7 +541,7 @@ static 
int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index d70187ce851b..3b4986e96e9c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -77,6 +77,7 @@ #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> +#include <net/xdp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -3880,8 +3881,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; - if (unlikely((metalen & (sizeof(__u32) - 1)) || - (metalen > 32))) + if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; @@ -4040,8 +4040,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, goto err; consume_skb(skb); break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; default: - /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4b2415d34873..bac0184cf3de 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1056,8 +1056,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } @@ -1101,8 +1103,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &iph->saddr, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 8ec7d13d2860..d0ae987d2de9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -43,6 +43,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f6af3e74fc44..e79aaf1f7139 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -710,15 +710,8 @@ out: int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; - int err = 0; - NETLINK_CB(skb).dst_group = group; - if (echo) - refcount_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) @@ -733,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; - int report = 0; - - if (nlh) - report = nlmsg_report(nlh); - nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), 
flags); } EXPORT_SYMBOL(rtnl_notify); diff --git a/net/core/scm.c b/net/core/scm.c index ae3085d9aae8..5c356f0dee30 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -355,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); diff --git a/net/core/selftests.c b/net/core/selftests.c index ba7b0171974c..9077fa969892 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -318,6 +318,15 @@ static int net_test_phy_loopback_udp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +static int net_test_phy_loopback_udp_mtu(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.max_size = ndev->mtu; + return __net_test_loopback(ndev, &attr); +} + static int net_test_phy_loopback_tcp(struct net_device *ndev) { struct net_packet_attrs attr = { }; @@ -345,6 +354,9 @@ static const struct net_test { .name = "PHY internal loopback, UDP ", .fn = net_test_phy_loopback_udp, }, { + .name = "PHY internal loopback, MTU ", + .fn = net_test_phy_loopback_udp_mtu, + }, { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fc7942c0dddc..fcbd977186b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -954,9 +954,13 @@ void __kfree_skb_defer(struct sk_buff *skb) void napi_skb_free_stolen_head(struct sk_buff *skb) { - nf_reset_ct(skb); - skb_dst_drop(skb); - skb_ext_put(skb); + if (unlikely(skb->slow_gro)) { + nf_reset_ct(skb); + skb_dst_drop(skb); + skb_ext_put(skb); + skb_orphan(skb); + skb->slow_gro = 0; + } napi_skb_cache_put(skb); } @@ -3889,6 +3893,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->last = skb; NAPI_GRO_CB(p)->count++; p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; p->truesize += skb->truesize; p->len += skb->len; @@ -4256,6 +4263,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int new_truesize; struct sk_buff *lp; if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) @@ -4287,10 +4295,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ - delta_truesize = skb->truesize - - SKB_TRUESIZE(skb_end_offset(skb)); + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; - skb->truesize -= skb->data_len; + skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; @@ -4319,12 +4327,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We don't need to clear skbinfo->nr_frags here */ - delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + new_truesize = SKB_TRUESIZE(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -6449,6 +6461,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) new->chunks = newlen; new->offset[id] = newoff; set_active: + skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); diff --git a/net/core/sock.c b/net/core/sock.c index a3eea6e0b30a..9671c32e6ef5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 60decd6420ca..ae5fa4338d9c 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -211,8 +211,6 @@ out: return psock; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); @@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock *psock; int ret; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (!sock_map_redirect_allowed(sk)) - goto no_progs; - stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); @@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } -no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk) sk->sk_protocol == IPPROTO_TCP; } -static bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) @@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); - else if (sk_is_udp(sk)) - return sk_hashed(sk); - - return false; + return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, @@ -1536,6 +1517,7 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } +EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7eb0fb231940..abb5c596a817 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1126,7 +1126,7 @@ static int __init dccp_init(void) dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 77fbf8e9df4b..387a7e81dd00 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -102,7 +102,7 @@ void dn_fib_free_info(struct dn_fib_info *fi) void 
dn_fib_release_info(struct dn_fib_info *fi) { spin_lock(&dn_fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { if (fi->fib_next) fi->fib_next->fib_prev = fi->fib_prev; if (fi->fib_prev) @@ -385,11 +385,11 @@ link_it: if ((ofi = dn_fib_find_info(fi)) != NULL) { fi->fib_dead = 1; dn_fib_free_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_inc(&fi->fib_treeref); refcount_set(&fi->fib_clntref, 1); spin_lock(&dn_fib_info_lock); fi->fib_next = dn_fib_info_list; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 00bb89b2d86f..bca1b5d66df2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -18,16 +18,6 @@ if NET_DSA # Drivers must select the appropriate tagging format(s) -config NET_DSA_TAG_8021Q - tristate - select VLAN_8021Q - help - Unlike the other tagging protocols, the 802.1Q config option simply - provides helpers for other tagging implementations that might rely on - VLAN in one way or another. It is not a complete solution. - - Drivers which use these helpers should select this as dependency. - config NET_DSA_TAG_AR9331 tristate "Tag driver for Atheros AR9331 SoC with built-in switch" help @@ -126,7 +116,6 @@ config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" depends on MSCC_OCELOT_SWITCH_LIB || \ (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) - select NET_DSA_TAG_8021Q help Say Y or M if you want to enable support for tagging frames with a custom VLAN-based header. Frames that require timestamping, such as @@ -149,7 +138,6 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" - select NET_DSA_TAG_8021Q select PACKING help Say Y or M if you want to enable support for tagging frames with the diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 44bc79952b8b..67ea009f242c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,10 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o tag_8021q.o # tagging formats -obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 185629f27f80..c7fa85fb3086 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1044,6 +1044,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) dp->ds = ds; dp->index = index; + dp->bridge_num = -1; INIT_LIST_HEAD(&dp->list); list_add_tail(&dp->list, &dst->ports); @@ -1265,6 +1266,9 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return -EEXIST; } + if (ds->dst->last_switch < ds->index) + ds->dst->last_switch = ds->index; + return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f201c33980bf..e43c5dc04282 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,6 +14,8 @@ #include <net/dsa.h> #include <net/gro_cells.h> +#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, @@ -39,6 +41,8 @@ enum { DSA_NOTIFIER_MRP_DEL, DSA_NOTIFIER_MRP_ADD_RING_ROLE, DSA_NOTIFIER_MRP_DEL_RING_ROLE, + DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, + DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -113,6 +117,14 @@ struct 
dsa_notifier_mrp_ring_role_info { int port; }; +/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */ +struct dsa_notifier_tag_8021q_vlan_info { + int tree_index; + int sw_index; + int port; + u16 vid; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; @@ -194,16 +206,14 @@ void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack); -int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, - struct netlink_ext_ack *extack); +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_lag_change(struct dsa_port *dp, struct netdev_lag_lower_state_info *linfo); int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack); -int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev, - struct netlink_ext_ack *extack); +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack); @@ -253,16 +263,18 @@ int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid); +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, - struct net_device *dev) + const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, - struct net_device *bridge_dev) + const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. @@ -272,7 +284,7 @@ static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, - struct net_device *dev) + const struct net_device *dev) { struct dsa_port *dp; @@ -283,6 +295,19 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, return false; } +/* Returns true if any port of this tree offloads the given bridge */ +static inline bool dsa_tree_offloads_bridge(struct dsa_switch_tree *dst, + const struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_bridge(dp, bridge_dev)) + return true; + + return false; +} + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; extern struct notifier_block dsa_slave_switchdev_notifier; @@ -372,6 +397,63 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) return skb; } +/* For switches without hardware support for DSA tagging to be able + * to support termination through the bridge. 
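dsa_find_designated_bridge_port_by_vid(), introduced in the next hunk, gives taggers a fallback when the hardware cannot say which port a bridged packet entered on. A hedged usage sketch for a tagger's rcv path (dsa_master_find_slave() is assumed from dsa_priv.h; the VID decode helpers appear later in this diff): decode the source precisely when the VID is a tag_8021q RX VID, otherwise terminate on any suitable port of that VLAN's bridge.

static struct net_device *foo_find_source(struct net_device *master, u16 vid)
{
	if (vid_is_dsa_8021q_rxvlan(vid))
		/* Precise: switch ID and port are encoded in the VID */
		return dsa_master_find_slave(master,
					     dsa_8021q_rx_switch_id(vid),
					     dsa_8021q_rx_source_port(vid));

	/* Imprecise: the packet was autonomously forwarded by a VLAN-aware
	 * bridge, so any learning/forwarding bridge port in this VID will do.
	 */
	return dsa_find_designated_bridge_port_by_vid(master, vid);
}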
+ */ +static inline struct net_device * +dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct bridge_vlan_info vinfo; + struct net_device *slave; + struct dsa_port *dp; + int err; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->type != DSA_PORT_TYPE_USER) + continue; + + if (!dp->bridge_dev) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + /* Since the bridge might learn this packet, keep the CPU port + * affinity with the port that will be used for the reply on + * xmit. + */ + if (dp->cpu_dp != cpu_dp) + continue; + + slave = dp->slave; + + err = br_vlan_get_info_rcu(slave, vid, &vinfo); + if (err) + continue; + + return slave; + } + + return NULL; +} + +/* If the ingress port offloads the bridge, we mark the frame as autonomously + * forwarded by hardware, so the software bridge doesn't forward it twice, back + * to us, because we already did. However, if we're in fallback mode and we do + * software bridging, we are not offloading it, therefore the dp->bridge_dev + * pointer is not populated, and flooding needs to be done by software (we are + * effectively operating in standalone ports mode). + */ +static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + + skb->offload_fwd_mark = !!(dp->bridge_dev); +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); @@ -386,6 +468,16 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops); +/* tag_8021q.c */ +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); + extern struct list_head dsa_tree_list; #endif diff --git a/net/dsa/master.c b/net/dsa/master.c index 3fc90e36772d..e8e19857621b 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -210,14 +210,14 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (dev->netdev_ops->ndo_do_ioctl) - err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); + if (dev->netdev_ops->ndo_eth_ioctl) + err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd); return err; } static const struct dsa_netdevice_ops dsa_netdev_ops = { - .ndo_do_ioctl = dsa_master_ioctl, + .ndo_eth_ioctl = dsa_master_ioctl, }; static int dsa_master_ethtool_setup(struct net_device *dev) diff --git a/net/dsa/port.c b/net/dsa/port.c index 28b45b7e66df..b927d94b6934 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -167,8 +167,8 @@ static void dsa_port_clear_brport_flags(struct dsa_port *dp) } } -static int dsa_port_switchdev_sync(struct dsa_port *dp, - struct netlink_ext_ack *extack) +static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp, + struct netlink_ext_ack *extack) { struct net_device *brport_dev = dsa_port_to_bridge_port(dp); struct net_device *br = dp->bridge_dev; @@ -194,59 +194,6 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; - err = br_mdb_replay(br, 
brport_dev, dp, true, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - /* Forwarding and termination FDB entries on the port */ - err = br_fdb_replay(br, brport_dev, dp, true, - &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - /* Termination FDB entries on the bridge itself */ - err = br_fdb_replay(br, br, dp, true, &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_vlan_replay(br, brport_dev, dp, true, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - return 0; -} - -static int dsa_port_switchdev_unsync_objs(struct dsa_port *dp, - struct net_device *br, - struct netlink_ext_ack *extack) -{ - struct net_device *brport_dev = dsa_port_to_bridge_port(dp); - int err; - - /* Delete the switchdev objects left on this port */ - err = br_mdb_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - /* Forwarding and termination FDB entries on the port */ - err = br_fdb_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - /* Termination FDB entries on the bridge itself */ - err = br_fdb_replay(br, br, dp, false, &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_vlan_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - return 0; } @@ -283,6 +230,83 @@ static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp) */ } +static int dsa_tree_find_bridge_num(struct dsa_switch_tree *dst, + struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + /* When preparing the offload for a port, it will have a valid + * dp->bridge_dev pointer but a not yet valid dp->bridge_num. + * However there might be other ports having the same dp->bridge_dev + * and a valid dp->bridge_num, so just ignore this port. + */ + list_for_each_entry(dp, &dst->ports, list) + if (dp->bridge_dev == bridge_dev && dp->bridge_num != -1) + return dp->bridge_num; + + return -1; +} + +static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst = dp->ds->dst; + int bridge_num = dp->bridge_num; + struct dsa_switch *ds = dp->ds; + + /* No bridge TX forwarding offload => do nothing */ + if (!ds->ops->port_bridge_tx_fwd_unoffload || dp->bridge_num == -1) + return; + + dp->bridge_num = -1; + + /* Check if the bridge is still in use, otherwise it is time + * to clean it up so we can reuse this bridge_num later. + */ + if (!dsa_tree_find_bridge_num(dst, bridge_dev)) + clear_bit(bridge_num, &dst->fwd_offloading_bridges); + + /* Notify the chips only once the offload has been deactivated, so + * that they can update their configuration accordingly. 
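The unoffload path above returns a bridge number to dst->fwd_offloading_bridges only once no port still references that bridge; the allocation side in the next hunk hands numbers out with find_first_zero_bit(), capped by ds->num_fwd_offloading_bridges. A self-contained sketch of that allocate/release discipline:

#include <linux/bitops.h>
#include <linux/errno.h>

static unsigned long fwd_offloading_bridges;	/* one bit per bridge_num */

static int bridge_num_get(unsigned int max_supported)
{
	int num = find_first_zero_bit(&fwd_offloading_bridges, BITS_PER_LONG);

	/* More offloaded bridges than the switch supports: the caller
	 * falls back to software forwarding for this bridge.
	 */
	if (num >= max_supported)
		return -ENOSPC;

	set_bit(num, &fwd_offloading_bridges);
	return num;
}

static void bridge_num_put(int num)
{
	clear_bit(num, &fwd_offloading_bridges);
}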
+ */ + ds->ops->port_bridge_tx_fwd_unoffload(ds, dp->index, bridge_dev, + bridge_num); +} + +static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst = dp->ds->dst; + struct dsa_switch *ds = dp->ds; + int bridge_num, err; + + if (!ds->ops->port_bridge_tx_fwd_offload) + return false; + + bridge_num = dsa_tree_find_bridge_num(dst, bridge_dev); + if (bridge_num < 0) { + /* First port that offloads TX forwarding for this bridge */ + bridge_num = find_first_zero_bit(&dst->fwd_offloading_bridges, + DSA_MAX_NUM_OFFLOADING_BRIDGES); + if (bridge_num >= ds->num_fwd_offloading_bridges) + return false; + + set_bit(bridge_num, &dst->fwd_offloading_bridges); + } + + dp->bridge_num = bridge_num; + + /* Notify the driver */ + err = ds->ops->port_bridge_tx_fwd_offload(ds, dp->index, bridge_dev, + bridge_num); + if (err) { + dsa_port_bridge_tx_fwd_unoffload(dp, bridge_dev); + return false; + } + + return true; +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack) { @@ -292,6 +316,9 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, .port = dp->index, .br = br, }; + struct net_device *dev = dp->slave; + struct net_device *brport_dev; + bool tx_fwd_offload; int err; /* Here the interface is already bridged. Reflect the current @@ -299,16 +326,31 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, */ dp->bridge_dev = br; + brport_dev = dsa_port_to_bridge_port(dp); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); if (err) goto out_rollback; - err = dsa_port_switchdev_sync(dp, extack); + tx_fwd_offload = dsa_port_bridge_tx_fwd_offload(dp, br); + + err = switchdev_bridge_port_offload(brport_dev, dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier, + tx_fwd_offload, extack); if (err) goto out_rollback_unbridge; + err = dsa_port_switchdev_sync_attrs(dp, extack); + if (err) + goto out_rollback_unoffload; + return 0; +out_rollback_unoffload: + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); out_rollback_unbridge: dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); out_rollback: @@ -316,10 +358,13 @@ out_rollback: return err; } -int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, - struct netlink_ext_ack *extack) +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br) { - return dsa_port_switchdev_unsync_objs(dp, br, extack); + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); } void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) @@ -337,6 +382,8 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; + dsa_port_bridge_tx_fwd_unoffload(dp, br); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); @@ -409,13 +456,10 @@ err_lag_join: return err; } -int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag, - struct netlink_ext_ack *extack) +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag) { if (dp->bridge_dev) - return dsa_port_pre_bridge_leave(dp, dp->bridge_dev, extack); - - return 0; + dsa_port_pre_bridge_leave(dp, dp->bridge_dev); } void dsa_port_lag_leave(struct dsa_port *dp, struct net_device 
*lag) @@ -844,7 +888,6 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { - cpu_dp->filter = tag_ops->filter; cpu_dp->rcv = tag_ops->rcv; cpu_dp->tag_ops = tag_ops; } @@ -1217,3 +1260,31 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n"); } + +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + + return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info); +} + +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + int err; + + err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info); + if (err) + pr_err("DSA: failed to notify tag_8021q VLAN deletion: %pe\n", + ERR_PTR(err)); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 532085da8d8f..6e1135d3ee33 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1687,7 +1687,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_fdb_dump = dsa_slave_fdb_dump, - .ndo_do_ioctl = dsa_slave_ioctl, + .ndo_eth_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_slave_netpoll_setup, @@ -2056,20 +2056,16 @@ static int dsa_slave_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct netlink_ext_ack *extack; - int err = 0; - - extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev) && !info->linking) - err = dsa_port_pre_bridge_leave(dp, info->upper_dev, extack); + dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) - err = dsa_port_pre_lag_leave(dp, info->upper_dev, extack); + dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be * meaningfully enslaved to a bridge yet */ - return notifier_from_errno(err); + return NOTIFY_DONE; } static int @@ -2357,26 +2353,98 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) kfree(switchdev_work); } -static int dsa_lower_dev_walk(struct net_device *lower_dev, - struct netdev_nested_priv *priv) +static bool dsa_foreign_dev_check(const struct net_device *dev, + const struct net_device *foreign_dev) { - if (dsa_slave_dev_check(lower_dev)) { - priv->data = (void *)netdev_priv(lower_dev); - return 1; - } + const struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch_tree *dst = dp->ds->dst; - return 0; + if (netif_is_bridge_master(foreign_dev)) + return !dsa_tree_offloads_bridge(dst, foreign_dev); + + if (netif_is_bridge_port(foreign_dev)) + return !dsa_tree_offloads_bridge_port(dst, foreign_dev); + + /* Everything else is foreign */ + return true; } -static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev) +static int dsa_slave_fdb_event(struct net_device *dev, + const struct net_device *orig_dev, + const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info, + unsigned long event) { - struct 
netdev_nested_priv priv = { - .data = NULL, - }; + struct dsa_switchdev_event_work *switchdev_work; + struct dsa_port *dp = dsa_slave_to_port(dev); + bool host_addr = fdb_info->is_local; + struct dsa_switch *ds = dp->ds; - netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv); + if (ctx && ctx != dp) + return 0; + + if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + if (dsa_slave_dev_check(orig_dev) && + switchdev_fdb_is_dynamically_learned(fdb_info)) + return 0; + + /* FDB entries learned by the software bridge should be installed as + * host addresses only if the driver requests assisted learning. + */ + if (switchdev_fdb_is_dynamically_learned(fdb_info) && + !ds->assisted_learning_on_cpu_port) + return 0; + + /* Also treat FDB entries on foreign interfaces bridged with us as host + * addresses. + */ + if (dsa_foreign_dev_check(dev, orig_dev)) + host_addr = true; + + switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + if (!switchdev_work) + return -ENOMEM; - return (struct dsa_slave_priv *)priv.data; + netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", + event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", + orig_dev->name, fdb_info->addr, fdb_info->vid, + host_addr ? " as host address" : ""); + + INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work); + switchdev_work->ds = ds; + switchdev_work->port = dp->index; + switchdev_work->event = event; + switchdev_work->dev = dev; + + ether_addr_copy(switchdev_work->addr, fdb_info->addr); + switchdev_work->vid = fdb_info->vid; + switchdev_work->host_addr = host_addr; + + /* Hold a reference for dsa_fdb_offload_notify */ + dev_hold(dev); + dsa_schedule_work(&switchdev_work->work); + + return 0; +} + +static int +dsa_slave_fdb_add_to_device(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info) +{ + return dsa_slave_fdb_event(dev, orig_dev, ctx, fdb_info, + SWITCHDEV_FDB_ADD_TO_DEVICE); +} + +static int +dsa_slave_fdb_del_to_device(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info) +{ + return dsa_slave_fdb_event(dev, orig_dev, ctx, fdb_info, + SWITCHDEV_FDB_DEL_TO_DEVICE); } /* Called under rcu_read_lock() */ @@ -2384,10 +2452,6 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - const struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_switchdev_event_work *switchdev_work; - bool host_addr = false; - struct dsa_port *dp; int err; switch (event) { @@ -2397,92 +2461,19 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, dsa_slave_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: + err = switchdev_handle_fdb_add_to_device(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_fdb_add_to_device, + NULL); + return notifier_from_errno(err); case SWITCHDEV_FDB_DEL_TO_DEVICE: - fdb_info = ptr; - - if (dsa_slave_dev_check(dev)) { - dp = dsa_slave_to_port(dev); - - if (fdb_info->is_local) - host_addr = true; - else if (!fdb_info->added_by_user) - return NOTIFY_OK; - } else { - /* Snoop addresses added to foreign interfaces - * bridged with us, or the bridge - * itself. Dynamically learned addresses can - * also be added for switches that don't - * automatically learn SA from CPU-injected - * traffic. 
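The rewritten handler below hands SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE to switchdev_handle_fdb_add_to_device()/..._del_to_device(), which walk the device hierarchy themselves; the driver only supplies two classification predicates plus the add/del callbacks above. A hedged sketch of the predicate contract (illustrative foo_* names):

static const struct net_device_ops foo_netdev_ops;

static bool foo_dev_check(const struct net_device *dev)
{
	/* "One of ours": the helper replays the event on our port */
	return dev->netdev_ops == &foo_netdev_ops;
}

static bool foo_foreign_dev_check(const struct net_device *dev,
				  const struct net_device *foreign_dev)
{
	/* "Foreign": bridged with us but not offloaded by us, so FDB
	 * entries seen there become host addresses trapped to the CPU.
	 */
	return !foo_dev_check(foreign_dev);
}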
- */ - struct net_device *br_dev; - struct dsa_slave_priv *p; - - if (netif_is_bridge_master(dev)) - br_dev = dev; - else - br_dev = netdev_master_upper_dev_get_rcu(dev); - - if (!br_dev) - return NOTIFY_DONE; - - if (!netif_is_bridge_master(br_dev)) - return NOTIFY_DONE; - - p = dsa_slave_dev_lower_find(br_dev); - if (!p) - return NOTIFY_DONE; - - dp = p->dp; - host_addr = fdb_info->is_local; - - /* FDB entries learned by the software bridge should - * be installed as host addresses only if the driver - * requests assisted learning. - * On the other hand, FDB entries for local termination - * should always be installed. - */ - if (!fdb_info->added_by_user && !fdb_info->is_local && - !dp->ds->assisted_learning_on_cpu_port) - return NOTIFY_DONE; - - /* When the bridge learns an address on an offloaded - * LAG we don't want to send traffic to the CPU, the - * other ports bridged with the LAG should be able to - * autonomously forward towards it. - * On the other hand, if the address is local - * (therefore not learned) then we want to trap it to - * the CPU regardless of whether the interface it - * belongs to is offloaded or not. - */ - if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev) && - !fdb_info->is_local) - return NOTIFY_DONE; - } - - if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del) - return NOTIFY_DONE; - - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); - if (!switchdev_work) - return NOTIFY_BAD; - - INIT_WORK(&switchdev_work->work, - dsa_slave_switchdev_event_work); - switchdev_work->ds = dp->ds; - switchdev_work->port = dp->index; - switchdev_work->event = event; - switchdev_work->dev = dev; - - ether_addr_copy(switchdev_work->addr, - fdb_info->addr); - switchdev_work->vid = fdb_info->vid; - switchdev_work->host_addr = host_addr; - - /* Hold a reference for dsa_fdb_offload_notify */ - dev_hold(dev); - dsa_schedule_work(&switchdev_work->work); - break; + err = switchdev_handle_fdb_del_to_device(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_fdb_del_to_device, + NULL); + return notifier_from_errno(err); default: return NOTIFY_DONE; } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5ece05dfd8f2..fd1a1c6bf9cf 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -90,18 +90,25 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { struct dsa_switch_tree *dst = ds->dst; + int err; if (dst->index == info->tree_index && ds->index == info->sw_index && - ds->ops->port_bridge_join) - return ds->ops->port_bridge_join(ds, info->port, info->br); + ds->ops->port_bridge_join) { + err = ds->ops->port_bridge_join(ds, info->port, info->br); + if (err) + return err; + } if ((dst->index != info->tree_index || ds->index != info->sw_index) && - ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->tree_index, - info->sw_index, - info->port, info->br); + ds->ops->crosschip_bridge_join) { + err = ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, + info->port, info->br); + if (err) + return err; + } - return 0; + return dsa_tag_8021q_bridge_join(ds, info); } static int dsa_switch_bridge_leave(struct dsa_switch *ds, @@ -151,7 +158,8 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (err && err != EOPNOTSUPP) return err; } - return 0; + + return dsa_tag_8021q_bridge_leave(ds, info); } /* Matches for all upstream-facing ports (the CPU port and all upstream-facing @@ -726,6 +734,12 @@ static int dsa_switch_event(struct notifier_block *nb, 
case DSA_NOTIFIER_MRP_DEL_RING_ROLE: err = dsa_switch_mrp_del_ring_role(ds, info); break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD: + err = dsa_switch_tag_8021q_vlan_add(ds, info); + break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL: + err = dsa_switch_tag_8021q_vlan_del(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4aa29f90ecea..654697ebb6f3 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -17,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | + * | DIR | VBID| SWITCH_ID | VBID | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,24 +27,14 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * SVL/SUBVLAN - { VID[9], VID[5:4] }: - * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. - * * 0 (0b000): Field does not encode a sub-VLAN, either because - * received traffic is untagged, PVID-tagged or because a second - * VLAN tag is present after this tag and not inside of it. - * * 1 (0b001): Received traffic is tagged with a VID value private - * to the host. This field encodes the index in the host's lookup - * table through which the value of the ingress VLAN ID can be - * recovered. - * * 2 (0b010): Field encodes a sub-VLAN. - * ... - * * 7 (0b111): Field encodes a sub-VLAN. - * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero - * (by the host) and ignored on receive (by the switch). - * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * + * VBID - { VID[9], VID[5:4] }: + * Virtual bridge ID. If between 1 and 7, packet targets the broadcast + * domain of a bridge. If transmitted as zero, packet targets a single + * port. Field only valid on transmit, must be ignored on receive. + * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. 
*/ @@ -61,23 +51,30 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) -#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 -#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) -#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 -#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(5, 4) -#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) -#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) -#define DSA_8021Q_SUBVLAN(x) \ - (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ - DSA_8021Q_SUBVLAN_LO_MASK) | \ - ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ - DSA_8021Q_SUBVLAN_HI_MASK)) +#define DSA_8021Q_VBID_HI_SHIFT 9 +#define DSA_8021Q_VBID_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_VBID_LO_SHIFT 4 +#define DSA_8021Q_VBID_LO_MASK GENMASK(5, 4) +#define DSA_8021Q_VBID_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_VBID_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_VBID(x) \ + (((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \ + DSA_8021Q_VBID_LO_MASK) | \ + ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \ + DSA_8021Q_VBID_HI_MASK)) #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num) +{ + /* The VBID value of 0 is reserved for precise TX */ + return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num + 1); +} +EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); + /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ @@ -98,13 +95,6 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); -u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) -{ - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); - /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -119,20 +109,6 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -/* Returns the decoded subvlan from the RX VID. */ -u16 dsa_8021q_rx_subvlan(u16 vid) -{ - u16 svl_hi, svl_lo; - - svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> - DSA_8021Q_SUBVLAN_HI_SHIFT; - svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> - DSA_8021Q_SUBVLAN_LO_SHIFT; - - return (svl_hi << 2) | svl_lo; -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); - bool vid_is_dsa_8021q_rxvlan(u16 vid) { return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; @@ -151,21 +127,152 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -/* If @enabled is true, installs @vid with @flags into the switch port's HW - * filter. - * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the - * user explicitly configured this @vid through the bridge core, then the @vid - * is installed again, but this time with the flags from the bridge layer. 
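A worked example of the coding scheme, using the VBID/SWITCH_ID/PORT masks from this hunk. The DIR values are not visible in this excerpt, so the RX = 0b01 / TX = 0b10 encoding below is an assumption consistent with the comment's claim that VIDs 0, 1 and 4095 stay unused:

#include <stdio.h>

#define DIR_RX		(1u << 10)			/* assumed 0b01 */
#define DIR_TX		(2u << 10)			/* assumed 0b10 */
#define SWITCH_ID(x)	(((x) & 0x7) << 6)		/* VID[8:6] */
#define VBID(x)		((((x) & 0x3) << 4) | \
			 ((((x) >> 2) & 0x1) << 9))	/* VID[9], VID[5:4] */
#define PORT(x)		((x) & 0xf)			/* VID[3:0] */

int main(void)
{
	/* RX VID for switch 1, port 2 */
	unsigned int rx_vid = DIR_RX | SWITCH_ID(1) | PORT(2);

	/* TX VID targeting the broadcast domain of bridge_num 0: VBID 0
	 * is reserved for precise TX, hence the +1.
	 */
	unsigned int tx_fwd_vid = DIR_TX | VBID(0 + 1);

	printf("rx_vid=%u tx_fwd_vid=%u\n", rx_vid, tx_fwd_vid);
	return 0;
}

Under those assumptions this prints rx_vid=1090 tx_fwd_vid=2064, matching what dsa_8021q_rx_vid() and dsa_8021q_bridge_tx_fwd_offload_vid() would compute for the same inputs.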
- */ -static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, - u16 flags, bool enabled) +static struct dsa_tag_8021q_vlan * +dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid) { - struct dsa_port *dp = dsa_to_port(ctx->ds, port); + struct dsa_tag_8021q_vlan *v; - if (enabled) - return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags); + list_for_each_entry(v, &ctx->vlans, list) + if (v->vid == vid && v->port == port) + return v; - return ctx->ops->vlan_del(ctx->ds, dp->index, vid); + return NULL; +} + +static int dsa_switch_do_tag_8021q_vlan_add(struct dsa_switch *ds, int port, + u16 vid, u16 flags) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_tag_8021q_vlan *v; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (v) { + refcount_inc(&v->refcount); + return 0; + } + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; + + err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + if (err) { + kfree(v); + return err; + } + + v->vid = vid; + v->port = port; + refcount_set(&v->refcount, 1); + list_add_tail(&v->list, &ctx->vlans); + + return 0; +} + +static int dsa_switch_do_tag_8021q_vlan_del(struct dsa_switch *ds, int port, + u16 vid) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_tag_8021q_vlan *v; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_del(ds, port, vid); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (!v) + return -ENOENT; + + if (!refcount_dec_and_test(&v->refcount)) + return 0; + + err = ds->ops->tag_8021q_vlan_del(ds, port, vid); + if (err) { + refcount_inc(&v->refcount); + return err; + } + + list_del(&v->list); + kfree(v); + + return 0; +} + +static bool +dsa_switch_tag_8021q_vlan_match(struct dsa_switch *ds, int port, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + return true; + + if (ds->dst->index == info->tree_index && ds->index == info->sw_index) + return port == info->port; + + return false; +} + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + int port, err; + + /* Since we use dsa_broadcast(), there might be other switches in other + * trees which don't support tag_8021q, so don't return an error. + * Or they might even support tag_8021q but have not registered yet to + * use it (maybe they use another tagger currently). 
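dsa_switch_do_tag_8021q_vlan_add()/..._del() above follow the standard find-or-create refcounting pattern, so a VLAN shared by several notifier sources is programmed into a CPU/DSA port exactly once and removed only by its last user. The pattern in isolation (generic sketch, not the DSA-specific code):

#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/types.h>

struct entry {
	struct list_head list;
	refcount_t refcount;
	u16 vid;
};

static int entry_get(struct list_head *entries, u16 vid,
		     int (*program)(u16 vid))
{
	struct entry *e;
	int err;

	list_for_each_entry(e, entries, list) {
		if (e->vid == vid) {
			refcount_inc(&e->refcount);	/* already programmed */
			return 0;
		}
	}

	e = kzalloc(sizeof(*e), GFP_KERNEL);
	if (!e)
		return -ENOMEM;

	err = program(vid);	/* only the first user touches the hardware */
	if (err) {
		kfree(e);
		return err;
	}

	e->vid = vid;
	refcount_set(&e->refcount, 1);
	list_add_tail(&e->list, entries);
	return 0;
}

/* entry_put() mirrors this: refcount_dec_and_test(), unprogram the
 * hardware, list_del(), kfree().
 */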
+ */ + if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { + u16 flags = 0; + + if (dsa_is_user_port(ds, port)) + flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (vid_is_dsa_8021q_rxvlan(info->vid) && + dsa_8021q_rx_switch_id(info->vid) == ds->index && + dsa_8021q_rx_source_port(info->vid) == port) + flags |= BRIDGE_VLAN_INFO_PVID; + + err = dsa_switch_do_tag_8021q_vlan_add(ds, port, + info->vid, + flags); + if (err) + return err; + } + } + + return 0; +} + +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + int port, err; + + if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { + err = dsa_switch_do_tag_8021q_vlan_del(ds, port, + info->vid); + if (err) + return err; + } + } + + return 0; } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -181,12 +288,6 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * force all switched traffic to pass through the CPU. So we must also make * the other front-panel ports members of this VID we're adding, albeit * we're not making it their PVID (they'll still have their own). - * By the way - just because we're installing the same VID in multiple - * switch ports doesn't mean that they'll start to talk to one another, even - * while not bridged: the final forwarding decision is still an AND between - * the L2 forwarding information (which is limiting forwarding in this case) - * and the VLAN-based restrictions (of which there are none in this case, - * since all ports are members). * - On TX (ingress from CPU and towards network) we are faced with a problem. * If we were to tag traffic (from within DSA) with the port's pvid, all * would be well, assuming the switch ports were standalone. Frames would @@ -200,9 +301,10 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * a member of the VID we're tagging the traffic with - the desired one. * * So at the end, each front-panel port will have one RX VID (also the PVID), - * the RX VID of all other front-panel ports, and one TX VID. Whereas the CPU - * port will have the RX and TX VIDs of all front-panel ports, and on top of - * that, is also tagged-input and tagged-output (VLAN trunk). + * the RX VID of all other front-panel ports that are in the same bridge, and + * one TX VID. Whereas the CPU port will have the RX and TX VIDs of all + * front-panel ports, and on top of that, is also tagged-input and + * tagged-output (VLAN trunk). 
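To make the long setup comment concrete (the diagram that follows shows the same thing graphically): for switch 0 with user ports 0-2 in one bridge, each port ends up with its own RX VID as pvid, the RX VIDs of its bridge peers, and its own TX VID, while the CPU port carries all of them tagged. A toy enumeration, reusing the hedged DIR values from the earlier sketch:

#include <stdio.h>

#define RX_VID(p)	((1u << 10) | (p))	/* switch id 0, assumed DIR_RX */
#define TX_VID(p)	((2u << 10) | (p))	/* switch id 0, assumed DIR_TX */

int main(void)
{
	int port, other;

	for (port = 0; port < 3; port++) {
		printf("swp%d: pvid %u, tx %u", port,
		       RX_VID(port), TX_VID(port));
		for (other = 0; other < 3; other++)
			if (other != port)
				printf(", rx %u", RX_VID(other));
		printf("\n");	/* CPU port: member of all VIDs, tagged */
	}
	return 0;
}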
* * CPU port CPU port * +-------------+-----+-------------+ +-------------+-----+-------------+ @@ -220,246 +322,245 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port, - bool enabled) +static bool dsa_tag_8021q_bridge_match(struct dsa_switch *ds, int port, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + + /* Don't match on self */ + if (ds->dst->index == info->tree_index && + ds->index == info->sw_index && + port == info->port) + return false; + + if (dsa_port_is_user(dp)) + return dp->bridge_dev == info->br; + + return false; +} + +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + u16 targeted_rx_vid; + int err, port; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + + for (port = 0; port < ds->num_ports; port++) { + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + if (!dsa_tag_8021q_bridge_match(ds, port, info)) + continue; + + /* Install the RX VID of the targeted port in our VLAN table */ + err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid); + if (err) + return err; + + /* Install our RX VID into the targeted port's VLAN table */ + err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid); + if (err) + return err; + } + + return 0; +} + +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) { - int upstream = dsa_upstream_port(ctx->ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port); + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + u16 targeted_rx_vid; + int port; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + + for (port = 0; port < ds->num_ports; port++) { + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + if (!dsa_tag_8021q_bridge_match(ds, port, info)) + continue; + + /* Remove the RX VID of the targeted port from our VLAN table */ + dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid); + + /* Remove our RX VID from the targeted port's VLAN table */ + dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid); + } + + return 0; +} + +int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload); + +void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload); + +/* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ +static int 
dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ds, port); struct net_device *master; - int i, err, subvlan; + int err; /* The CPU port is implicitly configured by * configuring the front-panel ports */ - if (!dsa_is_user_port(ctx->ds, port)) + if (!dsa_port_is_user(dp)) return 0; - master = dsa_to_port(ctx->ds, port)->cpu_dp->master; + master = dp->cpu_dp->master; /* Add this user port's RX VID to the membership list of all others * (including itself). This is so that bridging will not be hindered. * L2 forwarding rules still take precedence when there are no VLAN * restrictions, so there are no concerns about leaking traffic. */ - for (i = 0; i < ctx->ds->num_ports; i++) { - u16 flags; - - if (i == upstream) - continue; - else if (i == port) - /* The RX VID is pvid on this port */ - flags = BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_PVID; - else - /* The RX VID is a regular VLAN on all others */ - flags = BRIDGE_VLAN_INFO_UNTAGGED; - - err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); - return err; - } - } - - /* CPU port needs to see this port's RX VID - * as tagged egress. - */ - err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, rx_vid); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); + dev_err(ds->dev, + "Failed to apply RX VID %d to port %d: %pe\n", + rx_vid, port, ERR_PTR(err)); return err; } - /* Add to the master's RX filter not only @rx_vid, but in fact - * the entire subvlan range, just in case this DSA switch might - * want to use sub-VLANs. - */ - for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) { - u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan); - - if (enabled) - vlan_vid_add(master, ctx->proto, vid); - else - vlan_vid_del(master, ctx->proto, vid); - } + /* Add @rx_vid to the master's RX filter. 
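The rewritten error messages in dsa_tag_8021q_port_setup() print with "%pe" and ERR_PTR(err), yielding symbolic names like -ENOMEM instead of bare integers. That works because of the kernel convention of folding small negative errnos into the top page of the pointer space; here is a plain-C reconstruction of that convention, purely for illustration:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    /* errnos land in the last page of the address space */
    return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

static void *might_fail(int fail)
{
    static int storage;

    return fail ? ERR_PTR(-ENOMEM) : &storage;
}

int main(void)
{
    void *p = might_fail(1);

    if (IS_ERR(p))  /* the kernel would render this with "%pe" */
        printf("failed: errno %ld\n", -PTR_ERR(p));
    return 0;
}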
*/ + vlan_vid_add(master, ctx->proto, rx_vid); /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, - enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, port, err); - return err; - } - err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, tx_vid); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, upstream, err); + dev_err(ds->dev, + "Failed to apply TX VID %d on port %d: %pe\n", + tx_vid, port, ERR_PTR(err)); return err; } return err; } -int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) +static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { - int rc, port; + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ds, port); + struct net_device *master; - ASSERT_RTNL(); + /* The CPU port is implicitly configured by + * configuring the front-panel ports + */ + if (!dsa_port_is_user(dp)) + return; - for (port = 0; port < ctx->ds->num_ports; port++) { - rc = dsa_8021q_setup_port(ctx, port, enabled); - if (rc < 0) { - dev_err(ctx->ds->dev, - "Failed to setup VLAN tagging for port %d: %d\n", - port, rc); - return rc; - } - } + master = dp->cpu_dp->master; - return 0; -} -EXPORT_SYMBOL_GPL(dsa_8021q_setup); + dsa_port_tag_8021q_vlan_del(dp, rx_vid); -static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx, - int port, - struct dsa_8021q_context *other_ctx, - int other_port, bool enabled) -{ - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); + vlan_vid_del(master, ctx->proto, rx_vid); - /* @rx_vid of local @ds port @port goes to @other_port of - * @other_ds - */ - return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid, - BRIDGE_VLAN_INFO_UNTAGGED, enabled); + dsa_port_tag_8021q_vlan_del(dp, tx_vid); } -static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +static int dsa_tag_8021q_setup(struct dsa_switch *ds) { - struct dsa_8021q_crosschip_link *c; + int err, port; + + ASSERT_RTNL(); - list_for_each_entry(c, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - c->other_port == other_port) { - refcount_inc(&c->refcount); - return 0; + for (port = 0; port < ds->num_ports; port++) { + err = dsa_tag_8021q_port_setup(ds, port); + if (err < 0) { + dev_err(ds->dev, + "Failed to setup VLAN tagging for port %d: %pe\n", + port, ERR_PTR(err)); + return err; } } - dev_dbg(ctx->ds->dev, - "adding crosschip link from port %d to %s port %d\n", - port, dev_name(other_ctx->ds->dev), other_port); - - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - - c->port = port; - c->other_ctx = other_ctx; - c->other_port = other_port; - refcount_set(&c->refcount, 1); - - list_add(&c->list, &ctx->crosschip_links); - return 0; } -static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx, - struct dsa_8021q_crosschip_link *c, - bool *keep) +static void dsa_tag_8021q_teardown(struct dsa_switch *ds) { - *keep = !refcount_dec_and_test(&c->refcount); + int port; - if (*keep) - return; - - dev_dbg(ctx->ds->dev, - "deleting crosschip link from port %d to %s port %d\n", - c->port, dev_name(c->other_ctx->ds->dev), c->other_port); + ASSERT_RTNL(); - list_del(&c->list); - kfree(c); + for (port = 0; port 
< ds->num_ports; port++) + dsa_tag_8021q_port_teardown(ds, port); } -/* Make traffic from local port @port be received by remote port @other_port. - * This means that our @rx_vid needs to be installed on @other_ds's upstream - * and user ports. The user ports should be egress-untagged so that they can - * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged - * or untagged: it doesn't matter, since it should never egress a frame having - * our @rx_vid. - */ -int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) { - /* @other_upstream is how @other_ds reaches us. If we are part - * of disjoint trees, then we are probably connected through - * our CPU ports. If we're part of the same tree though, we should - * probably use dsa_towards_port. - */ - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - int rc; + struct dsa_8021q_context *ctx; - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port); - if (rc) - return rc; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; - rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_port, true); - if (rc) - return rc; + ctx->proto = proto; + ctx->ds = ds; - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream); - if (rc) - return rc; + INIT_LIST_HEAD(&ctx->vlans); - return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_upstream, true); + ds->tag_8021q_ctx = ctx; + + return dsa_tag_8021q_setup(ds); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_register); -int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +void dsa_tag_8021q_unregister(struct dsa_switch *ds) { - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - struct dsa_8021q_crosschip_link *c, *n; - - list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - (c->other_port == other_port || - c->other_port == other_upstream)) { - struct dsa_8021q_context *other_ctx = c->other_ctx; - int other_port = c->other_port; - bool keep; - int rc; - - dsa_8021q_crosschip_link_del(ctx, c, &keep); - if (keep) - continue; - - rc = dsa_8021q_crosschip_link_apply(ctx, port, - other_ctx, - other_port, - false); - if (rc) - return rc; - } + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_tag_8021q_vlan *v, *n; + + dsa_tag_8021q_teardown(ds); + + list_for_each_entry_safe(v, n, &ctx->vlans, list) { + list_del(&v->list); + kfree(v); } - return 0; + ds->tag_8021q_ctx = NULL; + + kfree(ctx); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) @@ -471,8 +572,7 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *subvlan) +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) { u16 vid, tci; @@ -489,9 +589,6 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, *source_port = dsa_8021q_rx_source_port(vid); *switch_id = dsa_8021q_rx_switch_id(vid); - *subvlan = dsa_8021q_rx_subvlan(vid); skb->priority = (tci & 
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rcv); - -MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 0750af951fc9..a27f5096777a 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -167,7 +167,7 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -271,7 +271,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); /* Move the Ethernet DA and SA */ memmove(skb->data - ETH_HLEN, diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index a822355afc90..3607499d0697 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -126,7 +126,42 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); + u8 tag_dev, tag_port; + enum dsa_cmd cmd; u8 *dsa_header; + u16 pvid = 0; + int err; + + if (skb->offload_fwd_mark) { + struct dsa_switch_tree *dst = dp->ds->dst; + struct net_device *br = dp->bridge_dev; + + cmd = DSA_CMD_FORWARD; + + /* When offloading forwarding for a bridge, inject FORWARD + * packets on behalf of a virtual switch device with an index + * past the physical switches. + */ + tag_dev = dst->last_switch + 1 + dp->bridge_num; + tag_port = 0; + + /* If we are offloading forwarding for a VLAN-unaware bridge, + * inject packets to hardware using the bridge's pvid, since + * that's where the packets ingressed from. + */ + if (!br_vlan_enabled(br)) { + /* Safe because __dev_queue_xmit() runs under + * rcu_read_lock_bh() + */ + err = br_vlan_get_pvid_rcu(br, &pvid); + if (err) + return NULL; + } + } else { + cmd = DSA_CMD_FROM_CPU; + tag_dev = dp->ds->index; + tag_port = dp->index; + } if (skb->protocol == htons(ETH_P_8021Q)) { if (extra) { @@ -134,10 +169,10 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); } - /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */ + /* Construct tagged DSA tag from 802.1Q tag. */ dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; - dsa_header[1] = dp->index << 3; + dsa_header[0] = (cmd << 6) | 0x20 | tag_dev; + dsa_header[1] = tag_port << 3; /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { @@ -148,12 +183,13 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, skb_push(skb, DSA_HLEN + extra); memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - /* Construct untagged FROM_CPU DSA tag. */ + /* Construct untagged DSA tag. 
*/ dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; - dsa_header[1] = dp->index << 3; - dsa_header[2] = 0x00; - dsa_header[3] = 0x00; + + dsa_header[0] = (cmd << 6) | tag_dev; + dsa_header[1] = tag_port << 3; + dsa_header[2] = pvid >> 8; + dsa_header[3] = pvid & 0xff; } return skb; @@ -162,8 +198,8 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { + bool trap = false, trunk = false; int source_device, source_port; - bool trunk = false; enum dsa_code code; enum dsa_cmd cmd; u8 *dsa_header; @@ -174,8 +210,6 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - skb->offload_fwd_mark = 1; - trunk = !!(dsa_header[1] & 7); break; @@ -194,7 +228,6 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, * device (like a bridge) that forwarding has * already been done by hardware. */ - skb->offload_fwd_mark = 1; break; case DSA_CODE_MGMT_TRAP: case DSA_CODE_IGMP_MLD_TRAP: @@ -202,6 +235,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, /* Traps have, by definition, not been * forwarded by hardware, so don't mark them. */ + trap = true; break; default: /* Reserved code, this could be anything. Drop @@ -235,6 +269,15 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; + /* When using LAG offload, skb->dev is not a DSA slave interface, + * so we cannot call dsa_default_offload_fwd_mark and we need to + * special-case it. + */ + if (trunk) + skb->offload_fwd_mark = true; + else if (!trap) + dsa_default_offload_fwd_mark(skb); + /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q * tag, and delete the ethertype (extra) if applicable. 
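dsa_xmit_ll() above now packs one of two commands into the same 4-byte header: FROM_CPU with the physical switch and port, or FORWARD with a virtual switch index and the bridge pvid in the last two bytes. A self-contained sketch of that packing and of the checks the receive side applies; the command values are assumptions about tag_dsa.c's encoding, the bit placement mirrors the hunk:

#include <assert.h>
#include <stdint.h>

enum dsa_cmd {              /* assumed numeric values */
    DSA_CMD_FROM_CPU = 1,
    DSA_CMD_FORWARD  = 3,
};

static void dsa_build_untagged(uint8_t hdr[4], enum dsa_cmd cmd,
                               uint8_t tag_dev, uint8_t tag_port,
                               uint16_t pvid)
{
    hdr[0] = (cmd << 6) | tag_dev;  /* cmd in bits 7:6, device below */
    hdr[1] = tag_port << 3;
    hdr[2] = pvid >> 8;             /* FORWARD carries the bridge pvid */
    hdr[3] = pvid & 0xff;
}

int main(void)
{
    uint8_t hdr[4];

    /* a FORWARD on behalf of virtual switch 5, bridge pvid 100 */
    dsa_build_untagged(hdr, DSA_CMD_FORWARD, 5, 0, 100);

    assert(hdr[0] >> 6 == DSA_CMD_FORWARD);
    assert((hdr[0] & 0x1f) == 5);
    assert(((hdr[2] << 8) | hdr[3]) == 100);
    /* trunk detection in dsa_rcv_ll(): low 3 bits of byte 1 */
    assert(!(hdr[1] & 7));
    return 0;
}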
If the * 'tagged' bit is cleared; delete the DSA tag, and ethertype diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 424130f85f59..c41208cbd936 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -44,7 +44,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index a201ccf2435d..1c2dfa80f9b0 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -24,7 +24,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - len); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 26207ef39ebc..cf7cf2fa1240 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -115,7 +115,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull_rcsum(skb, 2 + 2); memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU); + if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU)) + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index cc3ba864ad5b..3fb80e43f3a5 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -92,7 +92,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 190f4bfd3bef..3252634a29b8 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -104,7 +104,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, */ return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; /* Ocelot switches copy frames unmodified to the CPU. 
However, it is diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 85ac85c3af8c..c95de71d13b0 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -41,15 +41,15 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - int src_port, switch_id, subvlan; + int src_port, switch_id; - dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan); + dsa_8021q_rcv(skb, &src_port, &switch_id); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 57c46b4ab2b3..f6b63aad6551 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -114,7 +114,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, skb->data - ETH_HLEN - RTL4_A_HDR_LEN, 2 * ETH_ALEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c2df9ece01b..664cb802b71a 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -115,40 +115,6 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } -static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) -{ - struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); - u16 vlan_tci; - - if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) - return true; - - if (hdr->h_vlan_proto != htons(ETH_P_8021Q) && - !skb_vlan_tag_present(skb)) - return false; - - if (skb_vlan_tag_present(skb)) - vlan_tci = skb_vlan_tag_get(skb); - else - vlan_tci = ntohs(hdr->h_vlan_TCI); - - return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK); -} - -/* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it. - */ -static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) -{ - if (sja1105_can_use_vlan_as_tags(skb)) - return true; - if (sja1105_is_link_local(skb)) - return true; - if (sja1105_is_meta_frame(skb)) - return true; - return false; -} - /* Calls sja1105_port_deferred_xmit in sja1105_main.c */ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, struct sk_buff *skb) @@ -167,6 +133,31 @@ static u16 sja1105_xmit_tpid(struct sja1105_port *sp) return sp->xmit_tpid; } +static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + struct net_device *br = dp->bridge_dev; + u16 tx_vid; + + /* If the port is under a VLAN-aware bridge, just slide the + * VLAN-tagged packet into the FDB and hope for the best. + * This works because we support a single VLAN-aware bridge + * across the entire dst, and its VLANs cannot be shared with + * any standalone port. + */ + if (br_vlan_enabled(br)) + return skb; + + /* If the port is under a VLAN-unaware bridge, use an imprecise + * TX VLAN that targets the bridge's entire broadcast domain, + * instead of just the specific port. 
+ */ + tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(dp->bridge_num); + + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), tx_vid); +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -175,6 +166,9 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); + /* Transmitting management traffic does not rely upon switch tagging, * but instead SPI-installed management routes. Part 2 of this * is the .port_deferred_xmit driver callback. @@ -199,6 +193,9 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, __be16 *tx_header; int trailer_pos; + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); + /* Transmitting control packets is done using in-band control * extensions, while data packets are transmitted using * tag_8021q TX VLANs. @@ -358,20 +355,6 @@ static struct sk_buff return skb; } -static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) -{ - struct dsa_port *dp = dsa_slave_to_port(skb->dev); - struct sja1105_port *sp = dp->priv; - u16 vid = sp->subvlan_map[subvlan]; - u16 vlan_tci; - - if (vid == VLAN_N_VID) - return; - - vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); -} - static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) { u16 tpid = ntohs(eth_hdr(skb)->h_proto); @@ -385,25 +368,46 @@ static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110; } +/* If the VLAN in the packet is a tag_8021q one, set @source_port and + * @switch_id and strip the header. Otherwise set @vid and keep it in the + * packet. + */ +static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, + int *switch_id, u16 *vid) +{ + struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + u16 vlan_tci; + + if (skb_vlan_tag_present(skb)) + vlan_tci = skb_vlan_tag_get(skb); + else + vlan_tci = ntohs(hdr->h_vlan_TCI); + + if (vid_is_dsa_8021q_rxvlan(vlan_tci & VLAN_VID_MASK)) + return dsa_8021q_rcv(skb, source_port, switch_id); + + /* Try our best with imprecise RX */ + *vid = vlan_tci & VLAN_VID_MASK; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - int source_port, switch_id, subvlan = 0; + int source_port = -1, switch_id = -1; struct sja1105_meta meta = {0}; struct ethhdr *hdr; bool is_link_local; bool is_meta; + u16 vid; hdr = eth_hdr(skb); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); - skb->offload_fwd_mark = 1; - if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); } else if (is_link_local) { /* Management traffic path. 
Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -422,14 +426,17 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); if (!skb->dev) { netdev_warn(netdev, "Couldn't decode source port\n"); return NULL; } - if (subvlan) - sja1105_decode_subvlan(skb, subvlan); + if (!is_link_local) + dsa_default_offload_fwd_mark(skb); return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); @@ -474,7 +481,8 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, int *source_port, - int *switch_id) + int *switch_id, + bool *host_only) { u16 rx_header; @@ -488,6 +496,9 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, */ rx_header = ntohs(*(__be16 *)skb->data); + if (rx_header & SJA1110_RX_HEADER_HOST_ONLY) + *host_only = true; + if (rx_header & SJA1110_RX_HEADER_IS_METADATA) return sja1110_rcv_meta(skb, rx_header); @@ -538,31 +549,33 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - int source_port = -1, switch_id = -1, subvlan = 0; - - skb->offload_fwd_mark = 1; + int source_port = -1, switch_id = -1; + bool host_only = false; + u16 vid; if (sja1110_skb_has_inband_control_extension(skb)) { skb = sja1110_rcv_inband_control_extension(skb, &source_port, - &switch_id); + &switch_id, + &host_only); if (!skb) return NULL; } /* Packets with in-band control extensions might still have RX VLANs */ if (likely(sja1105_skb_has_tag_8021q(skb))) - dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); - skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); if (!skb->dev) { - netdev_warn(netdev, - "Couldn't decode source port %d and switch id %d\n", - source_port, switch_id); + netdev_warn(netdev, "Couldn't decode source port\n"); return NULL; } - if (subvlan) - sja1105_decode_subvlan(skb, subvlan); + if (!host_only) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -596,7 +609,6 @@ static const struct dsa_device_ops sja1105_netdev_ops = { .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, .rcv = sja1105_rcv, - .filter = sja1105_filter, .needed_headroom = VLAN_HLEN, .flow_dissect = sja1105_flow_dissect, .promisc_on_master = true, @@ -610,7 +622,6 @@ static const struct dsa_device_ops sja1110_netdev_ops = { .proto = DSA_TAG_PROTO_SJA1110, .xmit = sja1110_xmit, .rcv = sja1110_rcv, - .filter = sja1105_filter, .flow_dissect = sja1110_flow_dissect, .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN, .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN, diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index a31ff7fcb45f..da231c16ac82 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -46,7 +46,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; /* Frame is forwarded by hardware, don't forward in software. 
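sja1105_rcv() and sja1110_rcv() above converge on the same dispatch: a decoded tag_8021q VID yields a precise (switch, port) pair, while anything else leaves the VID in the packet so that a designated port of the bridge owning that VLAN can be picked instead. A toy rendition of that dispatch, with an invented lookup table standing in for dsa_master_find_slave() and dsa_find_designated_bridge_port_by_vid():

#include <stdio.h>

struct slave { int switch_id, port, vid; const char *name; };

static const struct slave slaves[] = {
    { 0, 1, 100, "swp1" },
    { 0, 2, 200, "swp2" },
};

#define N_SLAVES (sizeof(slaves) / sizeof(slaves[0]))

static const struct slave *find_precise(int switch_id, int port)
{
    for (unsigned int i = 0; i < N_SLAVES; i++)
        if (slaves[i].switch_id == switch_id && slaves[i].port == port)
            return &slaves[i];
    return NULL;
}

/* imprecise RX: any port that is a member of @vid will do, the bridge
 * re-classifies the packet afterwards
 */
static const struct slave *find_by_vid(int vid)
{
    for (unsigned int i = 0; i < N_SLAVES; i++)
        if (slaves[i].vid == vid)
            return &slaves[i];
    return NULL;
}

int main(void)
{
    int source_port = -1, switch_id = -1, vid = 200;
    const struct slave *dev;

    if (source_port == -1 || switch_id == -1)
        dev = find_by_vid(vid);         /* tag_8021q VID was absent */
    else
        dev = find_precise(switch_id, source_port);

    printf("delivered to %s\n", dev ? dev->name : "(dropped)");
    return 0;
}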
*/ - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9cce612e8976..171ba75b74c9 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -182,12 +182,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. - * The DSA tagging protocol may be able to decode some but not all - * traffic (for example only for management). In that case give it the - * option to filter the packets from which it can decode source port - * information. */ - if (unlikely(netdev_uses_dsa(dev)) && dsa_can_decode(skb, dev)) + if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index baa5d10043cb..b0fa2b00ad43 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -7,6 +7,7 @@ * the information ethtool needs. */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> @@ -807,6 +808,120 @@ out: return ret; } +static noinline_for_stack int +ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, + const struct compat_ethtool_rxnfc __user *useraddr, + size_t size) +{ + struct compat_ethtool_rxnfc crxnfc = {}; + + /* We expect there to be holes between fs.m_ext and + * fs.ring_cookie and at the end of fs, but nowhere else. + * On non-x86, no conversion should be needed. + */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && + sizeof(struct compat_ethtool_rxnfc) != + sizeof(struct ethtool_rxnfc)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + + sizeof(useraddr->fs.m_ext) != + offsetof(struct ethtool_rxnfc, fs.m_ext) + + sizeof(rxnfc->fs.m_ext)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) + return -EFAULT; + + *rxnfc = (struct ethtool_rxnfc) { + .cmd = crxnfc.cmd, + .flow_type = crxnfc.flow_type, + .data = crxnfc.data, + .fs = { + .flow_type = crxnfc.fs.flow_type, + .h_u = crxnfc.fs.h_u, + .h_ext = crxnfc.fs.h_ext, + .m_u = crxnfc.fs.m_u, + .m_ext = crxnfc.fs.m_ext, + .ring_cookie = crxnfc.fs.ring_cookie, + .location = crxnfc.fs.location, + }, + .rule_cnt = crxnfc.rule_cnt, + }; + + return 0; +} + +static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, + const void __user *useraddr, + size_t size) +{ + if (compat_need_64bit_alignment_fixup()) + return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); + + if (copy_from_user(rxnfc, useraddr, size)) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + struct compat_ethtool_rxnfc crxnfc; + + memset(&crxnfc, 0, sizeof(crxnfc)); + crxnfc = (struct compat_ethtool_rxnfc) { + .cmd = rxnfc->cmd, + .flow_type = rxnfc->flow_type, + .data = rxnfc->data, + .fs = { + .flow_type = rxnfc->fs.flow_type, + .h_u = rxnfc->fs.h_u, + .h_ext = rxnfc->fs.h_ext, + .m_u = rxnfc->fs.m_u, + .m_ext = rxnfc->fs.m_ext, + .ring_cookie = rxnfc->fs.ring_cookie, + .location = rxnfc->fs.location, + }, + .rule_cnt = rxnfc->rule_cnt, + }; + + if 
(copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_user(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + int ret; + + if (compat_need_64bit_alignment_fixup()) { + ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, + rule_buf); + useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); + } else { + ret = copy_to_user(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + } + + if (ret) + return -EFAULT; + + if (rule_buf) { + if (copy_to_user(useraddr, rule_buf, + rxnfc->rule_cnt * sizeof(u32))) + return -EFAULT; + } + + return 0; +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -825,7 +940,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; rc = dev->ethtool_ops->set_rxnfc(dev, &info); @@ -833,7 +948,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; if (cmd == ETHTOOL_SRXCLSRLINS && - copy_to_user(useraddr, &info, info_size)) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; @@ -859,7 +974,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* If FLOW_RSS was requested then user-space must be using the @@ -867,7 +982,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, */ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { info_size = sizeof(info); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. @@ -893,18 +1008,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (ret < 0) goto err_out; - ret = -EFAULT; - if (copy_to_user(useraddr, &info, info_size)) - goto err_out; - - if (rule_buf) { - useraddr += offsetof(struct ethtool_rxnfc, rule_locs); - if (copy_to_user(useraddr, rule_buf, - info.rule_cnt * sizeof(u32))) - goto err_out; - } - ret = 0; - + ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); @@ -2581,10 +2685,9 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) /* The main entry point in this file. 
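The BUILD_BUG_ON()s in ethtool_rxnfc_copy_from_compat() pin down where struct ethtool_rxnfc diverges between ABIs: i386 aligns u64 to 4 bytes while x86-64 aligns it to 8, so fs.ring_cookie and every later field sit at different offsets, and a plain copy would mis-slice the struct. The effect can be reproduced with an aligned(4) 64-bit typedef in the spirit of the kernel's compat_u64 (struct trimmed to the interesting fields):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t compat_u64 __attribute__((aligned(4)));

struct native_fs {
    uint32_t flow_type;
    uint64_t ring_cookie;   /* offset 8 on x86-64: 4 bytes of padding */
    uint32_t location;
};

struct compat_fs {
    uint32_t flow_type;
    compat_u64 ring_cookie; /* offset 4: the compat ABI has no hole */
    uint32_t location;
};

int main(void)
{
    printf("native ring_cookie @%zu, compat @%zu\n",
           offsetof(struct native_fs, ring_cookie),
           offsetof(struct compat_fs, ring_cookie));
    return 0;
}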
Called from net/core/dev_ioctl.c */ -int dev_ethtool(struct net *net, struct ifreq *ifr) +int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - void __user *useraddr = ifr->ifr_data; u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a45a0401adc5..f5077de3619e 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -129,7 +129,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, int ret = -ENOIOCTLCMD; struct net_device *dev; - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; @@ -143,7 +143,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 54648181dd56..0e4d758c2585 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -953,10 +953,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); - if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq))) + if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; @@ -966,7 +966,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 73721a4448bd..c82aded8da7d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -215,7 +215,7 @@ static void devinet_sysctl_unregister(struct in_device *idev) static struct in_ifaddr *inet_alloc_ifa(void) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); + return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT); } static void inet_rcu_free_ifa(struct rcu_head *head) @@ -1243,7 +1243,7 @@ out: return ret; } -static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) +int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); const struct in_ifaddr *ifa; @@ -2424,11 +2424,15 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + struct net *net = ctl->extra2; + int ret; - if (write && *valp != val) { - struct net *net = ctl->extra2; + if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_trylock()) { /* Restore the original values before restarting */ @@ -2762,8 +2766,6 @@ void __init devinet_init(void) INIT_HLIST_HEAD(&inet_addr_lst[i]); register_pernet_subsys(&devinet_ops); - - register_gifconf(PF_INET, inet_gifconf); 
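One detail worth noticing in the get_user_ifreq() conversions: ieee802154_dev_ioctl() still forces ifr_name[IFNAMSIZ-1] = 0 after the copy, since userspace may legally fill all bytes of the name without a terminator. A userspace analogue of why that single byte matters:

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

struct ifreq_name { char ifr_name[IFNAMSIZ]; };

static void sanitize(struct ifreq_name *ifr, const char *user)
{
    /* simulate copy_from_user(): raw bytes, possibly unterminated */
    memcpy(ifr->ifr_name, user, IFNAMSIZ);
    ifr->ifr_name[IFNAMSIZ - 1] = 0;    /* the defensive write */
}

int main(void)
{
    struct ifreq_name ifr;
    char evil[IFNAMSIZ];

    memset(evil, 'A', sizeof(evil));    /* 16 bytes, no terminator */
    sanitize(&ifr, evil);
    printf("safe to use as a string now: %s\n", ifr.ifr_name);
    return 0;
}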
register_netdevice_notifier(&ip_netdev_notifier); queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4c0c33e4710d..fa19f4cdf3a4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { spin_lock_bh(&fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -1373,7 +1373,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { - fi->fib_treeref++; + refcount_inc(&fi->fib_treeref); return fi; } } @@ -1547,11 +1547,11 @@ link_it: if (ofi) { fi->fib_dead = 1; free_fib_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_inc(&fi->fib_treeref); refcount_set(&fi->fib_clntref, 1); spin_lock_bh(&fib_info_lock); hlist_add_head(&fi->fib_hash, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 25cf387cca5b..8060524f4256 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2380,11 +2380,11 @@ void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6b3c558a4f23..03589a04f9aa 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2713,6 +2713,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { + spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; @@ -2723,6 +2724,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; + spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 12dca0c85f3c..6ebf05859acb 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -923,7 +923,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d8a8da3ae7e..a202dcec0dc2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -446,8 +446,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index be75b409445c..fe9101d3d69e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -958,19 +958,20 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); -int 
ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (!err && copy_to_user(data, &p, sizeof(p))) return -EFAULT; return err; } -EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); +EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index eb560eecee08..efe25a0172e6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -405,7 +405,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 266c65577ba6..3aa78ccbec3e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 99c06944501a..04754d55b3c1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1299,26 +1299,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; - unsigned int mtu = rt->rt_pmtu; - - if (!mtu || time_after_eq(jiffies, rt->dst.expires)) - mtu = dst_metric_raw(dst, RTAX_MTU); - - if (mtu) - goto out; - - mtu = READ_ONCE(dst->dev->mtu); - - if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) - mtu = 576; - } - -out: - mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cb44040ec68..f931def6302e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4512,7 +4512,9 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); /* Size and allocate the main established and bind bucket * hash tables. 
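The fib_semantics.c hunks earlier in this diff replace an open-coded `--fi->fib_treeref == 0` with refcount_dec_and_test() on a refcount_t, which saturates and warns on overflow or underflow instead of silently wrapping into a use-after-free. The core dec-and-test idiom in portable C11 (without the kernel's saturation semantics):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int treeref;

static bool ref_dec_and_test(atomic_int *r)
{
    /* fetch_sub returns the previous value: the last owner sees 1 */
    return atomic_fetch_sub_explicit(r, 1, memory_order_acq_rel) == 1;
}

int main(void)
{
    atomic_init(&treeref, 2);

    if (!ref_dec_and_test(&treeref))
        puts("still referenced");
    if (ref_dec_and_test(&treeref))
        puts("last reference gone: free the object");
    return 0;
}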
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 25fa4c01a17f..62ba8d0f2c60 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -55,12 +55,7 @@ void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - - ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); @@ -89,18 +84,12 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, ctx->num = 1; } - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - octx = rcu_dereference_protected(q->ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(q->ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); } else { - octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); } - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 149ceb5c94ff..3f7bd7ae7d7a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ +#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -454,11 +455,12 @@ static void tcp_sndbuf_expand(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, + unsigned int skbtruesize) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { @@ -471,7 +473,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing + * can play nice with us, as sk_buff and skb->head might be either + * freed or shared with up to MAX_SKB_FRAGS segments. + * Only give a boost to drivers using page frag(s) to hold the frame(s), + * and if no payload was pulled in skb->head before reaching us. 
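The tcp_fastopen hunk just above drops the per-netns spinlock entirely: replacing rcu_dereference_protected() + rcu_assign_pointer() under a lock with a single xchg() is safe because the whole critical section only ever swapped one pointer, and the old context still goes through call_rcu() so readers drain before it is freed. A userspace sketch of the swap half of that pattern (no concurrent readers here, so the deferred free collapses into free()):

#include <stdatomic.h>
#include <stdlib.h>

struct ctx { int num; };

static _Atomic(struct ctx *) fastopen_ctx;

static void replace_ctx(struct ctx *newc)
{
    /* one atomic exchange instead of lock/deref/assign/unlock */
    struct ctx *old = atomic_exchange(&fastopen_ctx, newc);

    /* kernel: call_rcu(&old->rcu, tcp_fastopen_ctx_free) */
    free(old);
}

int main(void)
{
    struct ctx *c = malloc(sizeof(*c));

    c->num = 1;
    replace_ctx(c);
    replace_ctx(NULL);  /* destroy path, as tcp_fastopen_ctx_destroy() */
    return 0;
}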
+ */ +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) +{ + u32 truesize = skb->truesize; + + if (adjust && !skb_headlen(skb)) { + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); + /* paranoid check, some drivers might be buggy */ + if (unlikely((int)truesize < (int)skb->len)) + truesize = skb->truesize; + } + return truesize; +} + +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, + bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; @@ -480,15 +502,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (room > 0 && !tcp_under_memory_pressure(sk)) { + unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); @@ -782,7 +805,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -969,6 +992,8 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) + state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ @@ -976,7 +1001,14 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; - tp->rack.dsack_seen = 1; + /* We increase the RACK ordering window in rounds where we receive + * DSACKs that may have been due to reordering causing RACK to trigger + * a spurious fast recovery. Thus RACK ignores DSACKs that happen + * without having seen reordering, or that match TLP probes (TLP + * is timer-driven, not triggered by RACK). + */ + if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) + tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ @@ -3628,7 +3660,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (!tp->tlp_retrans) { /* TLP of new data has been acknowledged */ tp->tlp_high_seq = 0; - } else if (flag & FLAG_DSACKING_ACK) { + } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -4769,7 +4801,7 @@ coalesce_done: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4857,7 +4889,7 @@ end: * and trigger fast retransmit. 
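The arithmetic behind truesize_adjust() above: when the driver left no payload in the linear head (skb_headlen() == 0), the head's contribution, SKB_TRUESIZE(skb_end_offset(skb)), can be discounted before the receive-window heuristics run, with a sanity clamp in case a buggy driver under-reports. A standalone rendition with an invented overhead constant standing in for SKB_TRUESIZE():

#include <stdio.h>

#define SKB_OVERHEAD 320    /* assumed sk_buff + shared-info overhead */

static unsigned int truesize_adjust(int headlen, unsigned int truesize,
                                    unsigned int len,
                                    unsigned int end_offset)
{
    unsigned int adjusted = truesize;

    if (!headlen) {
        /* discount the unused linear head */
        adjusted -= end_offset + SKB_OVERHEAD;
        /* paranoid check, mirroring the kernel's */
        if ((int)adjusted < (int)len)
            adjusted = truesize;
    }
    return adjusted;
}

int main(void)
{
    /* 1500-byte frame held entirely in page frags, 640-byte unused head */
    printf("adjusted truesize: %u\n", truesize_adjust(0, 2944, 1500, 640));
    return 0;
}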
*/ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } @@ -5383,7 +5415,7 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - sk->sk_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } static void tcp_check_space(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a692626c19e4..84db1c9ee92a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2964,7 +2964,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; - spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 6f1b4ac7fe99..fd113f6226ef 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -172,7 +172,8 @@ void tcp_rack_reo_timeout(struct sock *sk) /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * - * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 9f5a5cdc38e6..7a1d5f473878 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -112,7 +112,6 @@ static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; - prot->unhash = sock_map_unhash; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 747f56e0c636..e504204bca92 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -328,4 +328,15 @@ config IPV6_RPL_LWTUNNEL If unsure, say N. +config IPV6_IOAM6_LWTUNNEL + bool "IPv6: IOAM Pre-allocated Trace insertion support" + depends on IPV6 + select LWTUNNEL + help + Support for the inline insertion of IOAM Pre-allocated + Trace Header (only on locally generated packets), using + the lightweight tunnels mechanism. + + If unsure, say N. 
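The parser backing this new Kconfig knob appears further down, in the net/ipv6/exthdrs.c hunk: ipv6_hop_ioam() rejects options that are not 4n-aligned, then widens its trust one length check at a time before touching the trace data. A simplified, runnable rendition of that validation ladder (the struct layouts are stand-ins, not the real ioam6 definitions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ioam6_hdr { uint8_t opt_type, opt_len, reserved, type; };
struct ioam6_trace_hdr { uint16_t namespace_id; uint8_t remlen, pad; };

static bool ioam_hbh_valid(const uint8_t *opt, unsigned int optoff)
{
    const struct ioam6_hdr *hdr;
    const struct ioam6_trace_hdr *trace;

    if (optoff & 3)             /* must be 4n-aligned */
        return false;

    hdr = (const struct ioam6_hdr *)(opt + optoff);
    if (hdr->opt_len < 2)       /* truncated option header */
        return false;

    if (hdr->opt_len < 2 + sizeof(*trace))
        return false;           /* truncated trace header */

    trace = (const struct ioam6_trace_hdr *)(hdr + 1);
    if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
        return false;           /* trace data would overrun the option */

    return true;
}

int main(void)
{
    uint8_t buf[64] = { [1] = 60 }; /* opt_len ample for this sketch */

    printf("aligned, sane lengths: %s\n",
           ioam_hbh_valid(buf, 0) ? "accepted" : "dropped");
    printf("misaligned: %s\n",
           ioam_hbh_valid(buf, 2) ? "accepted" : "dropped");
    return 0;
}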
+ endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index cf7b47bdb9b3..1bc7e143217b 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o rpl.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o @@ -27,6 +27,7 @@ ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o +ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3bf685fe64b9..db0a89810f28 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -89,6 +89,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> +#include <linux/ioam6.h> #define INFINITY_LIFE_TIME 0xFFFFFFFF @@ -237,6 +238,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -293,6 +297,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -1080,7 +1087,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - ifa = kzalloc(sizeof(*ifa), gfp_flags); + ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT); if (!ifa) { err = -ENOBUFS; goto out; @@ -5211,8 +5218,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .netnsid = -1, .type = type, }; - struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; + struct net *tgt_net = sock_net(skb->sk); int idx, s_idx, s_ip_idx; int h, s_h; struct net_device *dev; @@ -5351,7 +5357,7 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct net *net = sock_net(in_skb->sk); + struct net *tgt_net = sock_net(in_skb->sk); struct inet6_fill_args fillargs = { .portid = NETLINK_CB(in_skb).portid, .seq = nlh->nlmsg_seq, @@ -5359,7 +5365,6 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .flags = 0, .netnsid = -1, }; - struct net *tgt_net = net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL, *peer; @@ -5526,6 +5531,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; + array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; + array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; + array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; } static inline size_t inet6_ifla6_size(void) @@ -6540,6 +6548,7 @@ static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int 
write, static int minus_one = -1; static const int two_five_five = 255; +static u32 ioam6_if_id_max = U16_MAX; static const struct ctl_table addrconf_sysctl[] = { { @@ -6933,6 +6942,31 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "ioam6_enabled", + .data = &ipv6_devconf.ioam6_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, + { + .procname = "ioam6_id", + .data = &ipv6_devconf.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)&ioam6_if_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &ipv6_devconf.ioam6_id_wide, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2389ff702f51..d92c90d97763 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -62,6 +62,7 @@ #include <net/rpl.h> #include <net/compat.h> #include <net/xfrm.h> +#include <net/ioam6.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -961,6 +962,9 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); + net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; + net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; + err = ipv6_init_mibs(net); if (err) return err; @@ -1191,6 +1195,10 @@ static int __init inet6_init(void) if (err) goto rpl_fail; + err = ioam6_init(); + if (err) + goto ioam6_fail; + err = igmp6_late_init(); if (err) goto igmp6_late_err; @@ -1213,6 +1221,8 @@ sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: + ioam6_exit(); +ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 26882e165c9e..d897faa4e9e6 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -49,6 +49,9 @@ #include <net/seg6_hmac.h> #endif #include <net/rpl.h> +#include <linux/ioam6.h> +#include <net/ioam6.h> +#include <net/dst_metadata.h> #include <linux/uaccess.h> @@ -928,6 +931,60 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) return false; } +/* IOAM */ + +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) +{ + struct ioam6_trace_hdr *trace; + struct ioam6_namespace *ns; + struct ioam6_hdr *hdr; + + /* Bad alignment (must be 4n-aligned) */ + if (optoff & 3) + goto drop; + + /* Ignore if IOAM is not enabled on ingress */ + if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + goto ignore; + + /* Truncated Option header */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + if (hdr->opt_len < 2) + goto drop; + + switch (hdr->type) { + case IOAM6_TYPE_PREALLOC: + /* Truncated Pre-allocated Trace header */ + if (hdr->opt_len < 2 + sizeof(*trace)) + goto drop; + + /* Malformed Pre-allocated Trace header */ + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); + if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) + goto drop; + + /* Ignore if the IOAM namespace is unknown */ + ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + if (!ns) + goto ignore; + + if (!skb_valid_dst(skb)) + ip6_route_input(skb); + + ioam6_fill_trace_data(skb, ns, trace); + break; + default: + break; + } + +ignore: + return true; + +drop: + kfree_skb(skb); + return false; +} + /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) @@ -1000,6 +1057,10 @@ 
static const struct tlvtype_proc tlvprochopopt_lst[] = { .func = ipv6_hop_ra, }, { + .type = IPV6_TLV_IOAM, + .func = ipv6_hop_ioam, + }, + { .type = IPV6_TLV_JUMBO, .func = ipv6_hop_jumbo, }, diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c new file mode 100644 index 000000000000..5e8961004832 --- /dev/null +++ b/net/ipv6/ioam6.c @@ -0,0 +1,910 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/ioam6.h> +#include <linux/ioam6_genl.h> +#include <linux/rhashtable.h> + +#include <net/addrconf.h> +#include <net/genetlink.h> +#include <net/ioam6.h> + +static void ioam6_ns_release(struct ioam6_namespace *ns) +{ + kfree_rcu(ns, rcu); +} + +static void ioam6_sc_release(struct ioam6_schema *sc) +{ + kfree_rcu(sc, rcu); +} + +static void ioam6_free_ns(void *ptr, void *arg) +{ + struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr; + + if (ns) + ioam6_ns_release(ns); +} + +static void ioam6_free_sc(void *ptr, void *arg) +{ + struct ioam6_schema *sc = (struct ioam6_schema *)ptr; + + if (sc) + ioam6_sc_release(sc); +} + +static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_namespace *ns = obj; + + return (ns->id != *(__be16 *)arg->key); +} + +static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_schema *sc = obj; + + return (sc->id != *(u32 *)arg->key); +} + +static const struct rhashtable_params rht_ns_params = { + .key_len = sizeof(__be16), + .key_offset = offsetof(struct ioam6_namespace, id), + .head_offset = offsetof(struct ioam6_namespace, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_ns_cmpfn, +}; + +static const struct rhashtable_params rht_sc_params = { + .key_len = sizeof(u32), + .key_offset = offsetof(struct ioam6_schema, id), + .head_offset = offsetof(struct ioam6_schema, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_sc_cmpfn, +}; + +static struct genl_family ioam6_genl_family; + +static const struct nla_policy ioam6_genl_policy_addns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 }, + [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 }, +}; + +static const struct nla_policy ioam6_genl_policy_delns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, +}; + +static const struct nla_policy ioam6_genl_policy_addsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY, + .len = IOAM6_MAX_SCHEMA_DATA_LEN }, +}; + +static const struct nla_policy ioam6_genl_policy_delsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ioam6_genl_policy_ns_sc[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, +}; + +static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + u64 data64; + u32 data32; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (ns) { + err = -EEXIST; + goto out_unlock; + } + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if 
(!ns) { + err = -ENOMEM; + goto out_unlock; + } + + ns->id = id; + + if (!info->attrs[IOAM6_ATTR_NS_DATA]) + data32 = IOAM6_U32_UNAVAILABLE; + else + data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); + + if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) + data64 = IOAM6_U64_UNAVAILABLE; + else + data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + + ns->data = cpu_to_be32(data32); + ns->data_wide = cpu_to_be64(data64); + + err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + kfree(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + sc = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + goto out_unlock; + + if (sc) + rcu_assign_pointer(sc->ns, NULL); + + ioam6_ns_release(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns, + u32 portid, + u32 seq, + u32 flags, + struct sk_buff *skb, + u8 cmd) +{ + struct ioam6_schema *sc; + u64 data64; + u32 data32; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + data32 = be32_to_cpu(ns->data); + data64 = be64_to_cpu(ns->data_wide); + + if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) || + (data32 != IOAM6_U32_UNAVAILABLE && + nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) || + (data64 != IOAM6_U64_UNAVAILABLE && + nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, + data64, IOAM6_ATTR_PAD))) + goto nla_put_failure; + + rcu_read_lock(); + + sc = rcu_dereference(ns->schema); + if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpns_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->namespaces, iter); + + return 0; +} + +static int ioam6_genl_dumpns_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_namespace *ns; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + ns = rhashtable_walk_next(iter); + + if (IS_ERR(ns)) { + if (PTR_ERR(ns) == -EAGAIN) + continue; + err = PTR_ERR(ns); + goto done; + } else if (!ns) { + break; + } + + err = __ioam6_genl_dumpns_element(ns, + 
NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_NAMESPACES); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + int len, len_aligned, err; + struct ioam6_schema *sc; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (sc) { + err = -EEXIST; + goto out_unlock; + } + + len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]); + len_aligned = ALIGN(len, 4); + + sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL); + if (!sc) { + err = -ENOMEM; + goto out_unlock; + } + + sc->id = id; + sc->len = len_aligned; + sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24)); + nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len); + + err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto free_sc; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +free_sc: + kfree(sc); + goto out_unlock; +} + +static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + int err; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + + ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto out_unlock; + + if (ns) + rcu_assign_pointer(ns->schema, NULL); + + ioam6_sc_release(sc); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + struct ioam6_namespace *ns; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) || + nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data)) + goto nla_put_failure; + + rcu_read_lock(); + + ns = rcu_dereference(sc->ns); + if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->schemas, iter); + + return 0; +} + +static int ioam6_genl_dumpsc_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpsc(struct sk_buff *skb, struct 
netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_schema *sc; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + sc = rhashtable_walk_next(iter); + + if (IS_ERR(sc)) { + if (PTR_ERR(sc) == -EAGAIN) + continue; + err = PTR_ERR(sc); + goto done; + } else if (!sc) { + break; + } + + err = __ioam6_genl_dumpsc_element(sc, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_SCHEMAS); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_namespace *ns, *ns_ref; + struct ioam6_schema *sc, *sc_ref; + struct ioam6_pernet_data *nsdata; + __be16 ns_id; + u32 sc_id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID] || + (!info->attrs[IOAM6_ATTR_SC_ID] && + !info->attrs[IOAM6_ATTR_SC_NONE])) + return -EINVAL; + + ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + if (info->attrs[IOAM6_ATTR_SC_NONE]) { + sc = NULL; + } else { + sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id, + rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + } + + sc_ref = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + if (sc_ref) + rcu_assign_pointer(sc_ref->ns, NULL); + rcu_assign_pointer(ns->schema, sc); + + if (sc) { + ns_ref = rcu_dereference_protected(sc->ns, + lockdep_is_held(&nsdata->lock)); + if (ns_ref) + rcu_assign_pointer(ns_ref->schema, NULL); + rcu_assign_pointer(sc->ns, ns); + } + + err = 0; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static const struct genl_ops ioam6_genl_ops[] = { + { + .cmd = IOAM6_CMD_ADD_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_NAMESPACES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpns_start, + .dumpit = ioam6_genl_dumpns, + .done = ioam6_genl_dumpns_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_ADD_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_SCHEMAS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpsc_start, + .dumpit = ioam6_genl_dumpsc, + .done = ioam6_genl_dumpsc_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_NS_SET_SCHEMA, + .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_ns_set_schema, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_ns_sc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1, + }, +}; + +static struct genl_family ioam6_genl_family __ro_after_init = { + .name = IOAM6_GENL_NAME, + .version = IOAM6_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ioam6_genl_ops, + .n_ops = ARRAY_SIZE(ioam6_genl_ops), + .module = THIS_MODULE, +}; + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); +} + +static void __ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + struct ioam6_schema *sc, + u8 sclen) +{ + struct __kernel_sock_timeval ts; + u64 raw64; + u32 raw32; + u16 raw16; + u8 *data; + u8 byte; + + data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; + + /* hop_lim and node_id */ + if (trace->type.bit0) { + byte = ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + + *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); + data += sizeof(__be32); + } + + /* ingress_if_id and egress_if_id */ + if (trace->type.bit1) { + if (!skb->dev) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + } + + /* timestamp seconds */ + if (trace->type.bit2) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + skb_get_new_timestamp(skb, &ts); + *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + } + data += sizeof(__be32); + } + + /* timestamp subseconds */ + if (trace->type.bit3) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + if (!trace->type.bit2) + skb_get_new_timestamp(skb, &ts); + + *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + } + data += sizeof(__be32); + } + + /* transit delay */ + if (trace->type.bit4) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* namespace data */ + if (trace->type.bit5) { + *(__be32 *)data = ns->data; + data += sizeof(__be32); + } + + /* queue depth */ + if (trace->type.bit6) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* checksum complement */ + if (trace->type.bit7) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* hop_lim and node_id (wide) */ + if (trace->type.bit8) { + byte = ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + + *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); + data += sizeof(__be64); + } + + /* ingress_if_id and egress_if_id (wide) */ + if (trace->type.bit9) { + if (!skb->dev) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw32 = 
IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + } + + /* namespace data (wide) */ + if (trace->type.bit10) { + *(__be64 *)data = ns->data_wide; + data += sizeof(__be64); + } + + /* buffer occupancy */ + if (trace->type.bit11) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* opaque state snapshot */ + if (trace->type.bit22) { + if (!sc) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8); + } else { + *(__be32 *)data = sc->hdr; + data += sizeof(__be32); + + memcpy(data, sc->data, sc->len); + } + } +} + +/* called with rcu_read_lock() */ +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_schema *sc; + u8 sclen = 0; + + /* Skip if Overflow flag is set OR + * if an unknown type (bit 12-21) is set + */ + if (trace->overflow || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) { + return; + } + + /* NodeLen does not include Opaque State Snapshot length. We need to + * take it into account if the corresponding bit is set (bit 22) and + * if the current IOAM namespace has an active schema attached to it + */ + sc = rcu_dereference(ns->schema); + if (trace->type.bit22) { + sclen = sizeof_field(struct ioam6_schema, hdr) / 4; + + if (sc) + sclen += sc->len / 4; + } + + /* If there is no space remaining, we set the Overflow flag and we + * skip without filling the trace + */ + if (!trace->remlen || trace->remlen < trace->nodelen + sclen) { + trace->overflow = 1; + return; + } + + __ioam6_fill_trace_data(skb, ns, trace, sc, sclen); + trace->remlen -= trace->nodelen + sclen; +} + +static int __net_init ioam6_net_init(struct net *net) +{ + struct ioam6_pernet_data *nsdata; + int err = -ENOMEM; + + nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); + if (!nsdata) + goto out; + + mutex_init(&nsdata->lock); + net->ipv6.ioam6_data = nsdata; + + err = rhashtable_init(&nsdata->namespaces, &rht_ns_params); + if (err) + goto free_nsdata; + + err = rhashtable_init(&nsdata->schemas, &rht_sc_params); + if (err) + goto free_rht_ns; + +out: + return err; +free_rht_ns: + rhashtable_destroy(&nsdata->namespaces); +free_nsdata: + kfree(nsdata); + net->ipv6.ioam6_data = NULL; + goto out; +} + +static void __net_exit ioam6_net_exit(struct net *net) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); + rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); + + kfree(nsdata); +} + +static struct pernet_operations ioam6_net_ops = { + .init = ioam6_net_init, + .exit = ioam6_net_exit, +}; + +int __init ioam6_init(void) +{ + int err = register_pernet_subsys(&ioam6_net_ops); + if (err) + goto out; + + err = genl_register_family(&ioam6_genl_family); + if (err) + goto out_unregister_pernet_subsys; + +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + err = ioam6_iptunnel_init(); + if (err) + goto out_unregister_genl; +#endif + + pr_info("In-situ OAM (IOAM) with IPv6\n"); + +out: + return err; +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL +out_unregister_genl: + genl_unregister_family(&ioam6_genl_family); +#endif +out_unregister_pernet_subsys: + unregister_pernet_subsys(&ioam6_net_ops); + goto out; +} + +void ioam6_exit(void) +{ +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + 
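+ /* unwind ioam6_init() in reverse order */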
ioam6_iptunnel_exit(); +#endif + genl_unregister_family(&ioam6_genl_family); + unregister_pernet_subsys(&ioam6_net_ops); +} diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c new file mode 100644 index 000000000000..f9ee04541c17 --- /dev/null +++ b/net/ipv6/ioam6_iptunnel.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM Lightweight Tunnel implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/netlink.h> +#include <linux/in6.h> +#include <linux/ioam6.h> +#include <linux/ioam6_iptunnel.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/lwtunnel.h> +#include <net/ioam6.h> + +#define IOAM6_MASK_SHORT_FIELDS 0xff100000 +#define IOAM6_MASK_WIDE_FIELDS 0xe00000 + +struct ioam6_lwt_encap { + struct ipv6_hopopt_hdr eh; + u8 pad[2]; /* 2-octet padding for 4n-alignment */ + struct ioam6_hdr ioamh; + struct ioam6_trace_hdr traceh; +} __packed; + +struct ioam6_lwt { + struct ioam6_lwt_encap tuninfo; +}; + +static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt) +{ + return (struct ioam6_lwt *)lwt->data; +} + +static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) +{ + return &ioam6_lwt_state(lwt)->tuninfo; +} + +static struct ioam6_trace_hdr *ioam6_trace(struct lwtunnel_state *lwt) +{ + return &(ioam6_lwt_state(lwt)->tuninfo.traceh); +} + +static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), +}; + +static int nla_put_ioam6_trace(struct sk_buff *skb, int attrtype, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_trace_hdr *data; + struct nlattr *nla; + int len; + + len = sizeof(*trace); + + nla = nla_reserve(skb, attrtype, len); + if (!nla) + return -EMSGSIZE; + + data = nla_data(nla); + memcpy(data, trace, len); + + return 0; +} + +static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) +{ + u32 fields; + + if (!trace->type_be32 || !trace->remlen || + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + return false; + + trace->nodelen = 0; + fields = be32_to_cpu(trace->type_be32); + + trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return true; +} + +static int ioam6_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; + struct ioam6_lwt_encap *tuninfo; + struct ioam6_trace_hdr *trace; + struct lwtunnel_state *s; + int len_aligned; + int len, err; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla, + ioam6_iptunnel_policy, extack); + if (err < 0) + return err; + + if (!tb[IOAM6_IPTUNNEL_TRACE]) { + NL_SET_ERR_MSG(extack, "missing trace"); + return -EINVAL; + } + + trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]); + if (!ioam6_validate_trace_hdr(trace)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE], + "invalid trace validation"); + return -EINVAL; + } + + len = sizeof(*tuninfo) + trace->remlen * 4; + len_aligned = ALIGN(len, 8); + + s = lwtunnel_state_alloc(len_aligned); + if (!s) + return -ENOMEM; + + tuninfo = ioam6_lwt_info(s); + tuninfo->eh.hdrlen = (len_aligned >> 3) - 1; + tuninfo->pad[0] = IPV6_TLV_PADN; + tuninfo->ioamh.type = 
IOAM6_TYPE_PREALLOC; + tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; + tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace) + + trace->remlen * 4; + + memcpy(&tuninfo->traceh, trace, sizeof(*trace)); + + len = len_aligned - len; + if (len == 1) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PAD1; + } else if (len > 0) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; + tuninfo->traceh.data[trace->remlen * 4 + 1] = len - 2; + } + + s->type = LWTUNNEL_ENCAP_IOAM6; + s->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = s; + + return 0; +} + +static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo) +{ + struct ioam6_trace_hdr *trace; + struct ipv6hdr *oldhdr, *hdr; + struct ioam6_namespace *ns; + int hdrlen, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + + err = skb_cow_head(skb, hdrlen + skb->mac_len); + if (unlikely(err)) + return err; + + oldhdr = ipv6_hdr(skb); + skb_pull(skb, sizeof(*oldhdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr)); + + skb_push(skb, sizeof(*oldhdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + memmove(hdr, oldhdr, sizeof(*oldhdr)); + tuninfo->eh.nexthdr = hdr->nexthdr; + + skb_set_transport_header(skb, sizeof(*hdr)); + skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen); + + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + + trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) + + sizeof(struct ipv6_hopopt_hdr) + 2 + + sizeof(struct ioam6_hdr)); + + ns = ioam6_namespace(dev_net(skb_dst(skb)->dev), trace->namespace_id); + if (ns) + ioam6_fill_trace_data(skb, ns, trace); + + return 0; +} + +static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct lwtunnel_state *lwt = skb_dst(skb)->lwtstate; + int err = -EINVAL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + /* Only for packets we send and + * that do not contain a Hop-by-Hop yet + */ + if (skb->dev || ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + + err = ioam6_do_inline(skb, ioam6_lwt_info(lwt)); + if (unlikely(err)) + goto drop; + + err = skb_cow_head(skb, LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + goto drop; + +out: + return lwt->orig_output(net, sk, skb); + +drop: + kfree_skb(skb); + return err; +} + +static int ioam6_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + if (nla_put_ioam6_trace(skb, IOAM6_IPTUNNEL_TRACE, trace)) + return -EMSGSIZE; + + return 0; +} + +static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + return nla_total_size(sizeof(*trace)); +} + +static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ioam6_trace_hdr *a_hdr = ioam6_trace(a); + struct ioam6_trace_hdr *b_hdr = ioam6_trace(b); + + return (a_hdr->namespace_id != b_hdr->namespace_id); +} + +static const struct lwtunnel_encap_ops ioam6_iptun_ops = { + .build_state = ioam6_build_state, + .output = ioam6_output, + .fill_encap = ioam6_fill_encap_info, + .get_encap_size = ioam6_encap_nlsize, + .cmp_encap = ioam6_encap_cmp, + .owner = THIS_MODULE, +}; + +int __init ioam6_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} + +void ioam6_iptunnel_exit(void) +{ + 
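+ /* remove the encap ops registered by ioam6_iptunnel_init() */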
lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2d650dc24349..a8f118e469b7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2449,8 +2449,8 @@ int __init fib6_init(void) int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fib6_node), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index bc224f917bbd..3ad201d372d8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1244,8 +1244,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, memcpy(u->name, p->name, sizeof(u->name)); } -static int ip6gre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ip6gre_tunnel_siocdevprivate(struct net_device *dev, + struct ifreq *ifr, void __user *data, + int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -1259,7 +1260,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1270,7 +1271,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; @@ -1281,7 +1282,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, goto done; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -EINVAL; @@ -1318,7 +1319,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); @@ -1331,7 +1332,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, if (dev == ign->fb_tunnel_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); @@ -1398,7 +1399,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_init = ip6gre_tunnel_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, - .ndo_do_ioctl = ip6gre_tunnel_ioctl, + .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e6ca9ad6812..d72347c75f8b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -608,7 +608,7 @@ int ip6_forward(struct sk_buff *skb) } } - mtu = ip6_dst_mtu_forward(dst); + mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 322698d9fcf4..20a67efda47f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1581,9 +1581,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) } /** - * ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: @@ -1609,7 +1610,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) **/ static int -ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; @@ -1623,7 +1625,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1635,9 +1637,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; - } break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: @@ -1645,7 +1646,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && @@ -1669,7 +1670,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { @@ -1683,7 +1684,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); @@ -1802,7 +1803,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { 
.ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 2d048e21abbb..1d8e3ffa225d 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -771,13 +771,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) } /** - * vti6_ioctl - configure vti6 tunnels from userspace + * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: - * vti6_ioctl() is used for managing vti6 tunnels + * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: @@ -798,7 +799,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) * %-ENODEV if attempting to change or delete a nonexisting device **/ static int -vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -810,7 +811,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -822,7 +823,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: @@ -831,7 +832,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) @@ -852,7 +853,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else @@ -865,7 +866,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); @@ -890,7 +891,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, - .ndo_do_ioctl = vti6_ioctl, + .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b6ddf23d3833..6b8051106aba 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3201,25 +3201,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { - struct inet6_dev *idev; - unsigned int mtu; - - mtu = dst_metric_raw(dst, RTAX_MTU); - 
if (mtu) - goto out; - - mtu = IPV6_MIN_MTU; - - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - -out: - mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); @@ -6638,7 +6620,7 @@ int __init ip6_route_init(void) ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index df5bea818410..ef0c7a7c18e2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -299,9 +299,8 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { - struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; @@ -321,7 +320,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) : + kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; rcu_read_lock(); @@ -334,7 +333,8 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * For root users, retry allocating enough memory for * the answer. */ - kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | + __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; @@ -453,8 +453,8 @@ out: return err; } -static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, - int cmd) +static int ipip6_tunnel_prl_ctl(struct net_device *dev, + struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; @@ -465,7 +465,7 @@ static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { @@ -1197,14 +1197,14 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } static int -ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1215,13 +1215,14 @@ ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int -ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, + int cmd) { struct ip_tunnel *t = netdev_priv(dev); 
struct ip_tunnel_6rd ip6rd; @@ -1229,7 +1230,7 @@ ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { @@ -1368,27 +1369,28 @@ ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: - return ip_tunnel_ioctl(dev, ifr, cmd); + return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: - return ipip6_tunnel_get_prl(dev, ifr); + return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - return ipip6_tunnel_prl_ctl(dev, ifr, cmd); + return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: - return ipip6_tunnel_get6rd(dev, ifr); + return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - return ipip6_tunnel_6rdctl(dev, ifr, cmd); + return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; @@ -1399,7 +1401,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, - .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index d7cf26f730d7..d53dd142bf87 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -21,6 +21,7 @@ #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif +#include <linux/ioam6.h> static int two = 2; static int three = 3; @@ -28,6 +29,8 @@ static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static u32 ioam6_id_max = IOAM6_DEFAULT_ID; +static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -196,6 +199,22 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "ioam6_id", + .data = &init_net.ipv6.sysctl.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = &ioam6_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &init_net.ipv6.sysctl.ioam6_id_wide, + .maxlen = sizeof(u64), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra2 = &ioam6_id_wide_max, + }, { } }; diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig new file mode 100644 index 000000000000..2cdf3d0a28c9 --- /dev/null +++ b/net/mctp/Kconfig @@ -0,0 +1,13 @@ + +menuconfig MCTP + depends on NET + tristate "MCTP core protocol support" + help + Management Component Transport Protocol (MCTP) is an in-system + protocol for communicating between management controllers and + their managed devices (peripherals, host processors, etc.). The + protocol is defined by DMTF specification DSP0236. + + This option enables core MCTP support. 
For communicating with other + devices, you'll want to enable a driver for a specific hardware + channel. diff --git a/net/mctp/Makefile b/net/mctp/Makefile new file mode 100644 index 000000000000..0171333384d7 --- /dev/null +++ b/net/mctp/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MCTP) += mctp.o +mctp-objs := af_mctp.o device.o route.o neigh.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c new file mode 100644 index 000000000000..84f722d31fd7 --- /dev/null +++ b/net/mctp/af_mctp.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_arp.h> +#include <linux/net.h> +#include <linux/mctp.h> +#include <linux/module.h> +#include <linux/socket.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +/* socket implementation */ + +static int mctp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + sk->sk_prot->close(sk, 0); + } + + return 0; +} + +static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +{ + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct sockaddr_mctp *smctp; + int rc; + + if (addrlen < sizeof(*smctp)) + return -EINVAL; + + if (addr->sa_family != AF_MCTP) + return -EAFNOSUPPORT; + + if (!capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + /* it's a valid sockaddr for MCTP, cast and do protocol checks */ + smctp = (struct sockaddr_mctp *)addr; + + lock_sock(sk); + + /* TODO: allow rebind */ + if (sk_hashed(sk)) { + rc = -EADDRINUSE; + goto out_release; + } + msk->bind_net = smctp->smctp_network; + msk->bind_addr = smctp->smctp_addr.s_addr; + msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */ + + rc = sk->sk_prot->hash(sk); + +out_release: + release_sock(sk); + + return rc; +} + +static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr); + int rc, addrlen = msg->msg_namelen; + struct sock *sk = sock->sk; + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct sk_buff *skb; + + if (addr) { + if (addrlen < sizeof(struct sockaddr_mctp)) + return -EINVAL; + if (addr->smctp_family != AF_MCTP) + return -EINVAL; + if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) + return -EINVAL; + + } else { + /* TODO: connect()ed sockets */ + return -EDESTADDRREQ; + } + + if (!capable(CAP_NET_RAW)) + return -EACCES; + + if (addr->smctp_network == MCTP_NET_ANY) + addr->smctp_network = mctp_default_net(sock_net(sk)); + + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr); + if (!rt) + return -EHOSTUNREACH; + + skb = sock_alloc_send_skb(sk, hlen + 1 + len, + msg->msg_flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + skb_reserve(skb, hlen); + + /* set type as first byte in payload */ + *(u8 *)skb_put(skb, 1) = addr->smctp_type; + + rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); + if (rc < 0) { + kfree_skb(skb); + return rc; + } + + /* set up cb */ + cb = __mctp_cb(skb); + cb->net = addr->smctp_network; + + rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, + addr->smctp_tag); + + return rc ? : len; +} +
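The send path above maps onto ordinary datagram calls: the message type travels in smctp_type (the kernel prepends it as the first payload byte), so userspace supplies only the type-specific body. A minimal userspace sketch, assuming the uapi definitions from <linux/mctp.h>, CAP_NET_RAW (which mctp_sendmsg() requires), and a hypothetical destination EID 8 reachable via a configured route:

#include <linux/mctp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_MCTP
#define AF_MCTP 45	/* value from linux/socket.h, for older libcs */
#endif

int main(void)
{
	struct sockaddr_mctp addr = { 0 };
	unsigned char body[2] = { 0x80, 0x00 };	/* example request body */
	int sd;

	sd = socket(AF_MCTP, SOCK_DGRAM, 0);
	if (sd < 0) {
		perror("socket");
		return 1;
	}

	addr.smctp_family = AF_MCTP;
	addr.smctp_network = MCTP_NET_ANY;	/* resolved to the default net */
	addr.smctp_addr.s_addr = 8;		/* destination EID (example) */
	addr.smctp_type = 0;			/* MCTP control protocol */
	addr.smctp_tag = MCTP_TAG_OWNER;	/* we own the tag; reply comes back to us */

	if (sendto(sd, body, sizeof(body), 0,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("sendto");

	close(sd);
	return 0;
}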
+static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t msglen; + u8 type; + int rc; + + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) + return -EOPNOTSUPP; + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + if (!skb->len) { + rc = 0; + goto out_free; + } + + /* extract message type, remove from data */ + type = *((u8 *)skb->data); + msglen = skb->len - 1; + + if (len < msglen) + msg->msg_flags |= MSG_TRUNC; + else + len = msglen; + + rc = skb_copy_datagram_msg(skb, 1, msg, len); + if (rc < 0) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (addr) { + struct mctp_skb_cb *cb = mctp_cb(skb); + /* TODO: expand mctp_skb_cb for header fields? */ + struct mctp_hdr *hdr = mctp_hdr(skb); + + addr = msg->msg_name; + addr->smctp_family = AF_MCTP; + addr->smctp_network = cb->net; + addr->smctp_addr.s_addr = hdr->src; + addr->smctp_type = type; + addr->smctp_tag = hdr->flags_seq_tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + msg->msg_namelen = sizeof(*addr); + } + + rc = len; + + if (flags & MSG_TRUNC) + rc = msglen; + +out_free: + skb_free_datagram(sk, skb); + return rc; +} + +static int mctp_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return -EINVAL; +} + +static int mctp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EINVAL; +} + +static const struct proto_ops mctp_dgram_ops = { + .family = PF_MCTP, + .release = mctp_release, + .bind = mctp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = mctp_setsockopt, + .getsockopt = mctp_getsockopt, + .sendmsg = mctp_sendmsg, + .recvmsg = mctp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int mctp_sk_init(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + INIT_HLIST_HEAD(&msk->keys); + return 0; +} + +static void mctp_sk_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +static int mctp_sk_hash(struct sock *sk) +{ + struct net *net = sock_net(sk); + + mutex_lock(&net->mctp.bind_lock); + sk_add_node_rcu(sk, &net->mctp.binds); + mutex_unlock(&net->mctp.bind_lock); + + return 0; +} + +static void mctp_sk_unhash(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(sk); + struct mctp_sk_key *key; + struct hlist_node *tmp; + unsigned long flags; + + /* remove from any type-based binds */ + mutex_lock(&net->mctp.bind_lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->mctp.bind_lock); + + /* remove tag allocations */ + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + hlist_del_rcu(&key->sklist); + hlist_del_rcu(&key->hlist); + + spin_lock(&key->reasm_lock); + if (key->reasm_head) + kfree_skb(key->reasm_head); + key->reasm_head = NULL; + key->reasm_dead = true; + spin_unlock(&key->reasm_lock); + + kfree_rcu(key, rcu); + } + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + synchronize_rcu(); +} + +static struct proto 
mctp_proto = { + .name = "MCTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct mctp_sock), + .init = mctp_sk_init, + .close = mctp_sk_close, + .hash = mctp_sk_hash, + .unhash = mctp_sk_unhash, +}; + +static int mctp_pf_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + const struct proto_ops *ops; + struct proto *proto; + struct sock *sk; + int rc; + + if (protocol) + return -EPROTONOSUPPORT; + + /* only datagram sockets are supported */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + proto = &mctp_proto; + ops = &mctp_dgram_ops; + + sock->state = SS_UNCONNECTED; + sock->ops = ops; + + sk = sk_alloc(net, PF_MCTP, GFP_KERNEL, proto, kern); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + rc = 0; + if (sk->sk_prot->init) + rc = sk->sk_prot->init(sk); + + if (rc) + goto err_sk_put; + + return 0; + +err_sk_put: + sock_orphan(sk); + sock_put(sk); + return rc; +} + +static struct net_proto_family mctp_pf = { + .family = PF_MCTP, + .create = mctp_pf_create, + .owner = THIS_MODULE, +}; + +static __init int mctp_init(void) +{ + int rc; + + /* ensure our uapi tag definitions match the header format */ + BUILD_BUG_ON(MCTP_TAG_OWNER != MCTP_HDR_FLAG_TO); + BUILD_BUG_ON(MCTP_TAG_MASK != MCTP_HDR_TAG_MASK); + + pr_info("mctp: management component transport protocol core\n"); + + rc = sock_register(&mctp_pf); + if (rc) + return rc; + + rc = proto_register(&mctp_proto, 0); + if (rc) + goto err_unreg_sock; + + rc = mctp_routes_init(); + if (rc) + goto err_unreg_proto; + + rc = mctp_neigh_init(); + if (rc) + goto err_unreg_proto; + + mctp_device_init(); + + return 0; + +err_unreg_proto: + proto_unregister(&mctp_proto); +err_unreg_sock: + sock_unregister(PF_MCTP); + + return rc; +} + +static __exit void mctp_exit(void) +{ + mctp_device_exit(); + mctp_neigh_exit(); + mctp_routes_exit(); + proto_unregister(&mctp_proto); + sock_unregister(PF_MCTP); +} + +module_init(mctp_init); +module_exit(mctp_exit); + +MODULE_DESCRIPTION("MCTP core"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jeremy Kerr <jk@codeconstruct.com.au>"); + +MODULE_ALIAS_NETPROTO(PF_MCTP); diff --git a/net/mctp/device.c b/net/mctp/device.c new file mode 100644 index 000000000000..b9f38e765f61 --- /dev/null +++ b/net/mctp/device.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - device implementation. 
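+ * Attaches per-interface MCTP state (struct mctp_dev) to its net_device and + * implements the rtnetlink address handlers used to assign local endpoint + * IDs (EIDs) to an interface.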
+ * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_link.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> + +#include <net/addrconf.h> +#include <net/netlink.h> +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +struct mctp_dump_cb { + int h; + int idx; + size_t a_idx; +}; + +/* unlocked: caller must hold rcu_read_lock */ +struct mctp_dev *__mctp_dev_get(const struct net_device *dev) +{ + return rcu_dereference(dev->mctp_ptr); +} + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) +{ + return rtnl_dereference(dev->mctp_ptr); +} + +static void mctp_dev_destroy(struct mctp_dev *mdev) +{ + struct net_device *dev = mdev->dev; + + dev_put(dev); + kfree_rcu(mdev, rcu); +} + +static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, + struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct ifaddrmsg *hdr; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifa_family = AF_MCTP; + hdr->ifa_prefixlen = 0; + hdr->ifa_flags = 0; + hdr->ifa_scope = 0; + hdr->ifa_index = mdev->dev->ifindex; + + if (nla_put_u8(skb, IFA_LOCAL, eid)) + goto cancel; + + if (nla_put_u8(skb, IFA_ADDRESS, eid)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + int rc = 0; + + for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) { + rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]); + if (rc < 0) + break; + } + + return rc; +} + +static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct ifaddrmsg *hdr; + struct mctp_dev *mdev; + int ifindex; + int idx, rc; + + hdr = nlmsg_data(cb->nlh); + // filter by ifindex if requested + ifindex = hdr->ifa_index; + + rcu_read_lock(); + for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { + idx = 0; + head = &net->dev_index_head[mcb->h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx >= mcb->idx && + (ifindex == 0 || ifindex == dev->ifindex)) { + mdev = __mctp_dev_get(dev); + if (mdev) { + rc = mctp_dump_dev_addrinfo(mdev, + skb, cb); + // Error indicates full buffer, this + // callback will get retried. 
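+ // The cursors in struct mctp_dump_cb (overlaid on cb->ctx) persist + // across calls, so a retried dump resumes from this device and + // address index rather than restarting the walk.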
+ if (rc < 0) + goto out; + } + } + idx++; + // reset for next iteration + mcb->a_idx = 0; + } + } +out: + rcu_read_unlock(); + mcb->idx = idx; + + return skb->len; +} + +static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = { + [IFA_ADDRESS] = { .type = NLA_U8 }, + [IFA_LOCAL] = { .type = NLA_U8 }, +}; + +static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *tmp_addrs; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (!mctp_address_ok(addr->s_addr)) + return -EINVAL; + + /* Prevent duplicates. Under RTNL so don't need to lock for reading */ + if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs)) + return -EEXIST; + + tmp_addrs = kmalloc(mdev->num_addrs + 1, GFP_KERNEL); + if (!tmp_addrs) + return -ENOMEM; + memcpy(tmp_addrs, mdev->addrs, mdev->num_addrs); + tmp_addrs[mdev->num_addrs] = addr->s_addr; + + /* Lock to write */ + spin_lock_irqsave(&mdev->addrs_lock, flags); + mdev->num_addrs++; + swap(mdev->addrs, tmp_addrs); + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + kfree(tmp_addrs); + + mctp_route_add_local(mdev, addr->s_addr); + + return 0; +} + +static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *pos; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + pos = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs); + if (!pos) + return -ENOENT; + + rc = mctp_route_remove_local(mdev, addr->s_addr); + // we can ignore -ENOENT in the case a route was already removed + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&mdev->addrs_lock, flags); + memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs)); + mdev->num_addrs--; + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + return 0; +} + +static struct mctp_dev *mctp_add_dev(struct net_device *dev) +{ + struct mctp_dev *mdev; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&mdev->addrs_lock); + + mdev->net = mctp_default_net(dev_net(dev)); + + /* associate to net_device */ + rcu_assign_pointer(dev->mctp_ptr, mdev); + dev_hold(dev); + mdev->dev = dev; + + return mdev; +} + +static int mctp_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + + 
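/* mctp_dev_get_rtnl() requires the caller to hold RTNL; + * rtnl_dereference() would trigger a lockdep warning here otherwise. + */ + 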
mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODATA; + if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) + return -EMSGSIZE; + return 0; +} + +static size_t mctp_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + unsigned int ret; + + /* caller holds RCU */ + mdev = __mctp_dev_get(dev); + if (!mdev) + return 0; + ret = nla_total_size(4); /* IFLA_MCTP_NET */ + return ret; +} + +static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = { + [IFLA_MCTP_NET] = { .type = NLA_U32 }, +}; + +static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MCTP_MAX + 1]; + struct mctp_dev *mdev; + int rc; + + rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy, + NULL); + if (rc) + return rc; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return 0; + + if (tb[IFLA_MCTP_NET]) + WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET])); + + return 0; +} + +static void mctp_unregister(struct net_device *dev) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + + if (!mdev) + return; + + RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL); + + mctp_route_remove_dev(mdev); + mctp_neigh_remove_dev(mdev); + kfree(mdev->addrs); + + mctp_dev_destroy(mdev); +} + +static int mctp_register(struct net_device *dev) +{ + struct mctp_dev *mdev; + + /* Already registered? */ + if (rtnl_dereference(dev->mctp_ptr)) + return 0; + + /* only register specific types; MCTP-specific and loopback for now */ + if (dev->type != ARPHRD_MCTP && dev->type != ARPHRD_LOOPBACK) + return 0; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + return 0; +} + +static int mctp_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int rc; + + switch (event) { + case NETDEV_REGISTER: + rc = mctp_register(dev); + if (rc) + return notifier_from_errno(rc); + break; + case NETDEV_UNREGISTER: + mctp_unregister(dev); + break; + } + + return NOTIFY_OK; +} + +static struct rtnl_af_ops mctp_af_ops = { + .family = AF_MCTP, + .fill_link_af = mctp_fill_link_af, + .get_link_af_size = mctp_get_link_af_size, + .set_link_af = mctp_set_link_af, +}; + +static struct notifier_block mctp_dev_nb = { + .notifier_call = mctp_dev_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +void __init mctp_device_init(void) +{ + register_netdevice_notifier(&mctp_dev_nb); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR, + NULL, mctp_dump_addrinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR, + mctp_rtm_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR, + mctp_rtm_deladdr, NULL, 0); + rtnl_af_register(&mctp_af_ops); +} + +void __exit mctp_device_exit(void) +{ + rtnl_af_unregister(&mctp_af_ops); + rtnl_unregister(PF_MCTP, RTM_DELADDR); + rtnl_unregister(PF_MCTP, RTM_NEWADDR); + rtnl_unregister(PF_MCTP, RTM_GETADDR); + + unregister_netdevice_notifier(&mctp_dev_nb); +} diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c new file mode 100644 index 000000000000..90ed2f02d1fb --- /dev/null +++ b/net/mctp/neigh.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - neighbour + * implementation. + * + * This is currently based on a simple neighbour list, with no cache. The + * number of neighbours should stay fairly small, so the lookup cost is small. 
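+ * + * A neighbour maps (device, EID) to a link-layer address; a sketch of + * the transmit-side usage (cf mctp_route_output() in route.c): + * + *   u8 haddr[MAX_ADDR_LEN]; + * + *   if (mctp_neigh_lookup(mdev, eid, haddr) == 0) + *           haddr now holds the physical address for eid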
+ * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, + enum mctp_neigh_source source, + size_t lladdr_len, const void *lladdr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc; + + mutex_lock(&net->mctp.neigh_lock); + if (mctp_neigh_lookup(mdev, eid, NULL) == 0) { + rc = -EEXIST; + goto out; + } + + if (lladdr_len > sizeof(neigh->ha)) { + rc = -EINVAL; + goto out; + } + + neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); + if (!neigh) { + rc = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&neigh->list); + neigh->dev = mdev; + dev_hold(neigh->dev->dev); + neigh->eid = eid; + neigh->source = source; + memcpy(neigh->ha, lladdr, lladdr_len); + + list_add_rcu(&neigh->list, &net->mctp.neighbours); + rc = 0; +out: + mutex_unlock(&net->mctp.neigh_lock); + return rc; +} + +static void __mctp_neigh_free(struct rcu_head *rcu) +{ + struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); + + dev_put(neigh->dev->dev); + kfree(neigh); +} + +/* Removes all neighbour entries referring to a device */ +void mctp_neigh_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + } + } + + mutex_unlock(&net->mctp.neigh_lock); +} + +// TODO: add a "source" flag so netlink can only delete static neighbours? +static int mctp_neigh_remove(struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + bool dropped = false; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev && neigh->eid == eid) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + dropped = true; + } + } + + mutex_unlock(&net->mctp.neigh_lock); + return dropped ? 
0 : -ENOENT; +} + +static const struct nla_policy nd_mctp_policy[NDA_MAX + 1] = { + [NDA_DST] = { .type = NLA_U8 }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, +}; + +static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX + 1]; + int rc; + mctp_eid_t eid; + void *lladdr; + int lladdr_len; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "lladdr too large?"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + + if (!tb[NDA_LLADDR]) { + NL_SET_ERR_MSG(extack, "Neighbour lladdr must be specified"); + return -EINVAL; + } + + eid = nla_get_u8(tb[NDA_DST]); + if (!mctp_address_ok(eid)) { + NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); + return -EINVAL; + } + + lladdr = nla_data(tb[NDA_LLADDR]); + lladdr_len = nla_len(tb[NDA_LLADDR]); + + ndm = nlmsg_data(nlh); + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (lladdr_len != dev->addr_len) { + NL_SET_ERR_MSG(extack, "Wrong lladdr length"); + return -EINVAL; + } + + return mctp_neigh_add(mdev, eid, MCTP_NEIGH_STATIC, + lladdr_len, lladdr); +} + +static int mctp_rtm_delneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + int rc; + mctp_eid_t eid; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + eid = nla_get_u8(tb[NDA_DST]); + + ndm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + return mctp_neigh_remove(mdev, eid); +} + +static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, + unsigned int flags, struct mctp_neigh *neigh) +{ + struct net_device *dev = neigh->dev->dev; + struct nlmsghdr *nlh; + struct ndmsg *hdr; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ndm_family = AF_MCTP; + hdr->ndm_ifindex = dev->ifindex; + hdr->ndm_state = 0; // TODO other state bits? + if (neigh->source == MCTP_NEIGH_STATIC) + hdr->ndm_state |= NUD_PERMANENT; + hdr->ndm_flags = 0; + hdr->ndm_type = RTN_UNICAST; // TODO: is loopback RTN_LOCAL? 
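+ // (NUD_PERMANENT above marks statically-configured entries, which + // show as "permanent" in neighbour dumps; a dynamic source would + // leave ndm_state clear.)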
+ + if (nla_put_u8(skb, NDA_DST, neigh->eid)) + goto cancel; + + if (nla_put(skb, NDA_LLADDR, dev->addr_len, neigh->ha)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int rc, idx, req_ifindex; + struct mctp_neigh *neigh; + struct ndmsg *ndmsg; + struct { + int idx; + } *cbctx = (void *)cb->ctx; + + ndmsg = nlmsg_data(cb->nlh); + req_ifindex = ndmsg->ndm_ifindex; + + idx = 0; + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (idx < cbctx->idx) + goto cont; + + rc = 0; + if (req_ifindex == 0 || req_ifindex == neigh->dev->dev->ifindex) + rc = mctp_fill_neigh(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, neigh); + + if (rc) + break; +cont: + idx++; + } + rcu_read_unlock(); + + cbctx->idx = idx; + return skb->len; +} + +int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc = -EHOSTUNREACH; // TODO: or ENOENT? + + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (mdev == neigh->dev && eid == neigh->eid) { + if (ret_hwaddr) + memcpy(ret_hwaddr, neigh->ha, + sizeof(neigh->ha)); + rc = 0; + break; + } + } + rcu_read_unlock(); + return rc; +} + +/* namespace registration */ +static int __net_init mctp_neigh_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->neighbours); + mutex_init(&ns->neigh_lock); + return 0; +} + +static void __net_exit mctp_neigh_net_exit(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + struct mctp_neigh *neigh; + + list_for_each_entry(neigh, &ns->neighbours, list) + call_rcu(&neigh->rcu, __mctp_neigh_free); +} + +/* net namespace implementation */ + +static struct pernet_operations mctp_net_ops = { + .init = mctp_neigh_net_init, + .exit = mctp_neigh_net_exit, +}; + +int __init mctp_neigh_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, + mctp_rtm_newneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELNEIGH, + mctp_rtm_delneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETNEIGH, + NULL, mctp_rtm_getneigh, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_neigh_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_GETNEIGH); + rtnl_unregister(PF_MCTP, RTM_DELNEIGH); + rtnl_unregister(PF_MCTP, RTM_NEWNEIGH); +} diff --git a/net/mctp/route.c b/net/mctp/route.c new file mode 100644 index 000000000000..b3101375c8e7 --- /dev/null +++ b/net/mctp/route.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - routing + * implementation. + * + * This is currently based on a simple routing table, with no dst cache. The + * number of routes should stay fairly small, so the lookup cost is small. 
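+ * + * Routes cover an inclusive EID range [min, max] on a single interface; + * mctp_route_lookup() is a linear scan of net->mctp.routes which takes + * a reference on the matched route.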
+ * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <uapi/linux/if_arp.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +static const unsigned int mctp_message_maxlen = 64 * 1024; + +/* route output callbacks */ +static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *mh; + struct sock *sk; + u8 type; + + WARN_ON(!rcu_read_lock_held()); + + /* TODO: look up in skb->cb? */ + mh = mctp_hdr(skb); + + if (!skb_headlen(skb)) + return NULL; + + type = (*(u8 *)skb->data) & 0x7f; + + sk_for_each_rcu(sk, &net->mctp.binds) { + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net) + continue; + + if (msk->bind_type != type) + continue; + + if (msk->bind_addr != MCTP_ADDR_ANY && + msk->bind_addr != mh->dest) + continue; + + return msk; + } + + return NULL; +} + +static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, + mctp_eid_t peer, u8 tag) +{ + if (key->local_addr != local) + return false; + + if (key->peer_addr != peer) + return false; + + if (key->tag != tag) + return false; + + return true; +} + +static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, + mctp_eid_t peer) +{ + struct mctp_sk_key *key, *ret; + struct mctp_hdr *mh; + u8 tag; + + WARN_ON(!rcu_read_lock_held()); + + mh = mctp_hdr(skb); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + ret = NULL; + + hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) { + if (mctp_key_match(key, mh->dest, peer, tag)) { + ret = key; + break; + } + } + + return ret; +} + +static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, + mctp_eid_t local, mctp_eid_t peer, + u8 tag, gfp_t gfp) +{ + struct mctp_sk_key *key; + + key = kzalloc(sizeof(*key), gfp); + if (!key) + return NULL; + + key->peer_addr = peer; + key->local_addr = local; + key->tag = tag; + key->sk = &msk->sk; + spin_lock_init(&key->reasm_lock); + + return key; +} + +static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_sk_key *tmp; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { + if (mctp_key_match(tmp, key->local_addr, key->peer_addr, + key->tag)) { + rc = -EEXIST; + break; + } + } + + if (!rc) { + hlist_add_head(&key->hlist, &net->mctp.keys); + hlist_add_head(&key->sklist, &msk->keys); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + return rc; +} + +/* Must be called with key->reasm_lock, which it will release. Will schedule + * the key for an RCU free. 
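+ * + * The reassembly skb is freed only after both locks are dropped, and + * the key is unlinked with hlist_del_rcu() + kfree_rcu(), since RCU + * readers may still hold the key until the grace period ends.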
+ */ +static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, + unsigned long flags) + __releases(&key->reasm_lock) +{ + struct sk_buff *skb; + + skb = key->reasm_head; + key->reasm_head = NULL; + key->reasm_dead = true; + spin_unlock_irqrestore(&key->reasm_lock, flags); + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_del_rcu(&key->hlist); + hlist_del_rcu(&key->sklist); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + kfree_rcu(key, rcu); + + if (skb) + kfree_skb(skb); +} + +static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) +{ + struct mctp_hdr *hdr = mctp_hdr(skb); + u8 exp_seq, this_seq; + + this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) + & MCTP_HDR_SEQ_MASK; + + if (!key->reasm_head) { + key->reasm_head = skb; + key->reasm_tailp = &(skb_shinfo(skb)->frag_list); + key->last_seq = this_seq; + return 0; + } + + exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; + + if (this_seq != exp_seq) + return -EINVAL; + + if (key->reasm_head->len + skb->len > mctp_message_maxlen) + return -EINVAL; + + skb->next = NULL; + skb->sk = NULL; + *key->reasm_tailp = skb; + key->reasm_tailp = &skb->next; + + key->last_seq = this_seq; + + key->reasm_head->data_len += skb->len; + key->reasm_head->len += skb->len; + key->reasm_head->truesize += skb->truesize; + + return 0; +} + +static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct mctp_sk_key *key; + struct mctp_sock *msk; + struct mctp_hdr *mh; + unsigned long f; + u8 tag, flags; + int rc; + + msk = NULL; + rc = -EINVAL; + + /* we may be receiving a locally-routed packet; drop source sk + * accounting + */ + skb_orphan(skb); + + /* ensure we have enough data for a header and a type */ + if (skb->len < sizeof(struct mctp_hdr) + 1) + goto out; + + /* grab header, advance data ptr */ + mh = mctp_hdr(skb); + skb_pull(skb, sizeof(struct mctp_hdr)); + + if (mh->ver != 1) + goto out; + + flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + rcu_read_lock(); + + /* lookup socket / reasm context, exactly matching (src,dest,tag) */ + key = mctp_lookup_key(net, skb, mh->src); + + if (flags & MCTP_HDR_FLAG_SOM) { + if (key) { + msk = container_of(key->sk, struct mctp_sock, sk); + } else { + /* first response to a broadcast? do a more general + * key lookup to find the socket, but don't use this + * key for reassembly - we'll create a more specific + * one for future packets if required (ie, !EOM). + */ + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); + if (key) { + msk = container_of(key->sk, + struct mctp_sock, sk); + key = NULL; + } + } + + if (!key && !msk && (tag & MCTP_HDR_FLAG_TO)) + msk = mctp_lookup_bind(net, skb); + + if (!msk) { + rc = -ENOENT; + goto out_unlock; + } + + /* single-packet message? deliver to socket, clean up any + * pending key. 
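+ * + * (a pending key means an earlier SOM for this (src, dest, tag) never + * saw its EOM; that partial reassembly cannot complete once the tag is + * reused, so it is dropped here.)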
+ */ + if (flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(&msk->sk, skb); + if (key) { + spin_lock_irqsave(&key->reasm_lock, f); + /* we've hit a pending reassembly; not much we + * can do but drop it + */ + __mctp_key_unlock_drop(key, net, f); + } + rc = 0; + goto out_unlock; + } + + /* broadcast response or a bind() - create a key for further + * packets for this message + */ + if (!key) { + key = mctp_key_alloc(msk, mh->dest, mh->src, + tag, GFP_ATOMIC); + if (!key) { + rc = -ENOMEM; + goto out_unlock; + } + + /* we can queue without the reasm lock here, as the + * key isn't observable yet + */ + mctp_frag_queue(key, skb); + + /* if the key_add fails, we've raced with another + * SOM packet with the same src, dest and tag. There's + * no way to distinguish future packets, so all we + * can do is drop; we'll free the skb on exit from + * this function. + */ + rc = mctp_key_add(key, msk); + if (rc) + kfree(key); + + } else { + /* existing key: start reassembly */ + spin_lock_irqsave(&key->reasm_lock, f); + + if (key->reasm_head || key->reasm_dead) { + /* duplicate start? drop everything */ + __mctp_key_unlock_drop(key, net, f); + rc = -EEXIST; + } else { + rc = mctp_frag_queue(key, skb); + spin_unlock_irqrestore(&key->reasm_lock, f); + } + } + + } else if (key) { + /* this packet continues a previous message; reassemble + * using the message-specific key + */ + + spin_lock_irqsave(&key->reasm_lock, f); + + /* we need to be continuing an existing reassembly... */ + if (!key->reasm_head) + rc = -EINVAL; + else + rc = mctp_frag_queue(key, skb); + + /* end of message? deliver to socket, and we're done with + * the reassembly/response key + */ + if (!rc && flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(key->sk, key->reasm_head); + key->reasm_head = NULL; + __mctp_key_unlock_drop(key, net, f); + } else { + spin_unlock_irqrestore(&key->reasm_lock, f); + } + + } else { + /* not a start, no matching key */ + rc = -ENOENT; + } + +out_unlock: + rcu_read_unlock(); +out: + if (rc) + kfree_skb(skb); + return rc; +} + +static unsigned int mctp_route_mtu(struct mctp_route *rt) +{ + return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); +} + +static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) +{ + struct mctp_hdr *hdr = mctp_hdr(skb); + char daddr_buf[MAX_ADDR_LEN]; + char *daddr = NULL; + unsigned int mtu; + int rc; + + skb->protocol = htons(ETH_P_MCTP); + + mtu = READ_ONCE(skb->dev->mtu); + if (skb->len > mtu) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* If lookup fails let the device handle daddr==NULL */ + if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + daddr = daddr_buf; + + rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + daddr, skb->dev->dev_addr, skb->len); + if (rc) { + kfree_skb(skb); + return -EHOSTUNREACH; + } + + rc = dev_queue_xmit(skb); + if (rc) + rc = net_xmit_errno(rc); + + return rc; +} + +/* route alloc/release */ +static void mctp_route_release(struct mctp_route *rt) +{ + if (refcount_dec_and_test(&rt->refs)) { + dev_put(rt->dev->dev); + kfree_rcu(rt, rcu); + } +} + +/* returns a route with the refcount at 1 */ +static struct mctp_route *mctp_route_alloc(void) +{ + struct mctp_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->list); + refcount_set(&rt->refs, 1); + rt->output = mctp_route_discard; + + return rt; +} + +unsigned int mctp_default_net(struct net *net) +{ + return READ_ONCE(net->mctp.default_net); +} + +int mctp_default_net_set(struct net *net, unsigned int index) 
+{ + if (index == 0) + return -EINVAL; + WRITE_ONCE(net->mctp.default_net, index); + return 0; +} + +/* tag management */ +static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, + struct mctp_sock *msk) +{ + struct netns_mctp *mns = &net->mctp; + + lockdep_assert_held(&mns->keys_lock); + + /* we hold the net->key_lock here, allowing updates to both + * then net and sk + */ + hlist_add_head_rcu(&key->hlist, &mns->keys); + hlist_add_head_rcu(&key->sklist, &msk->keys); +} + +/* Allocate a locally-owned tag value for (saddr, daddr), and reserve + * it for the socket msk + */ +static int mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp) +{ + struct net *net = sock_net(&msk->sk); + struct netns_mctp *mns = &net->mctp; + struct mctp_sk_key *key, *tmp; + unsigned long flags; + int rc = -EAGAIN; + u8 tagbits; + + /* be optimistic, alloc now */ + key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); + if (!key) + return -ENOMEM; + + /* 8 possible tag values */ + tagbits = 0xff; + + spin_lock_irqsave(&mns->keys_lock, flags); + + /* Walk through the existing keys, looking for potential conflicting + * tags. If we find a conflict, clear that bit from tagbits + */ + hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* if we don't own the tag, it can't conflict */ + if (tmp->tag & MCTP_HDR_FLAG_TO) + continue; + + if ((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr) + tagbits &= ~(1 << tmp->tag); + + if (!tagbits) + break; + } + + if (tagbits) { + key->tag = __ffs(tagbits); + mctp_reserve_tag(net, key, msk); + *tagp = key->tag; + rc = 0; + } + + spin_unlock_irqrestore(&mns->keys_lock, flags); + + if (!tagbits) + kfree(key); + + return rc; +} + +/* routing lookups */ +static bool mctp_rt_match_eid(struct mctp_route *rt, + unsigned int net, mctp_eid_t eid) +{ + return READ_ONCE(rt->dev->net) == net && + rt->min <= eid && rt->max >= eid; +} + +/* compares match, used for duplicate prevention */ +static bool mctp_rt_compare_exact(struct mctp_route *rt1, + struct mctp_route *rt2) +{ + ASSERT_RTNL(); + return rt1->dev->net == rt2->dev->net && + rt1->min == rt2->min && + rt1->max == rt2->max; +} + +struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *tmp, *rt = NULL; + + list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { + /* TODO: add metrics */ + if (mctp_rt_match_eid(tmp, dnet, daddr)) { + if (refcount_inc_not_zero(&tmp->refs)) { + rt = tmp; + break; + } + } + } + + return rt; +} + +/* sends a skb to rt and releases the route. 
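The caller's reference on rt is always consumed, even when rt->output() fails; the fragmentation path below instead calls rt->output() per fragment and drops the reference once at the end. 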
*/ +int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb) +{ + int rc; + + rc = rt->output(rt, skb); + mctp_route_release(rt); + return rc; +} + +static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, + unsigned int mtu, u8 tag) +{ + const unsigned int hlen = sizeof(struct mctp_hdr); + struct mctp_hdr *hdr, *hdr2; + unsigned int pos, size; + struct sk_buff *skb2; + int rc; + u8 seq; + + hdr = mctp_hdr(skb); + seq = 0; + rc = 0; + + if (mtu < hlen + 1) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* we've got the header */ + skb_pull(skb, hlen); + + for (pos = 0; pos < skb->len;) { + /* size of message payload */ + size = min(mtu - hlen, skb->len - pos); + + skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL); + if (!skb2) { + rc = -ENOMEM; + break; + } + + /* generic skb copy */ + skb2->protocol = skb->protocol; + skb2->priority = skb->priority; + skb2->dev = skb->dev; + memcpy(skb2->cb, skb->cb, sizeof(skb2->cb)); + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* establish packet */ + skb_reserve(skb2, MCTP_HEADER_MAXLEN); + skb_reset_network_header(skb2); + skb_put(skb2, hlen + size); + skb2->transport_header = skb2->network_header + hlen; + + /* copy header fields, calculate SOM/EOM flags & seq */ + hdr2 = mctp_hdr(skb2); + hdr2->ver = hdr->ver; + hdr2->dest = hdr->dest; + hdr2->src = hdr->src; + hdr2->flags_seq_tag = tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + if (pos == 0) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM; + + if (pos + size == skb->len) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM; + + hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT; + + /* copy message payload */ + skb_copy_bits(skb, pos, skb_transport_header(skb2), size); + + /* do route, but don't drop the rt reference */ + rc = rt->output(rt, skb2); + if (rc) + break; + + seq = (seq + 1) & MCTP_HDR_SEQ_MASK; + pos += size; + } + + mctp_route_release(rt); + consume_skb(skb); + return rc; +} + +int mctp_local_output(struct sock *sk, struct mctp_route *rt, + struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *hdr; + unsigned long flags; + unsigned int mtu; + mctp_eid_t saddr; + int rc; + u8 tag; + + if (WARN_ON(!rt->dev)) + return -EINVAL; + + spin_lock_irqsave(&rt->dev->addrs_lock, flags); + if (rt->dev->num_addrs == 0) { + rc = -EHOSTUNREACH; + } else { + /* use the outbound interface's first address as our source */ + saddr = rt->dev->addrs[0]; + rc = 0; + } + spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); + + if (rc) + return rc; + + if (req_tag & MCTP_HDR_FLAG_TO) { + rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (rc) + return rc; + tag |= MCTP_HDR_FLAG_TO; + } else { + tag = req_tag; + } + + + skb->protocol = htons(ETH_P_MCTP); + skb->priority = 0; + skb_reset_transport_header(skb); + skb_push(skb, sizeof(struct mctp_hdr)); + skb_reset_network_header(skb); + skb->dev = rt->dev->dev; + + /* cb->net will have been set on initial ingress */ + cb->src = saddr; + + /* set up common header fields */ + hdr = mctp_hdr(skb); + hdr->ver = 1; + hdr->dest = daddr; + hdr->src = saddr; + + mtu = mctp_route_mtu(rt); + + if (skb->len + sizeof(struct mctp_hdr) <= mtu) { + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | + tag; + return mctp_do_route(rt, skb); + } else { + return mctp_do_fragment_route(rt, skb, mtu, tag); + } +} + +/* route management */ +static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t 
daddr_start, + unsigned int daddr_extent, unsigned int mtu, + bool is_local) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *ert; + + if (!mctp_address_ok(daddr_start)) + return -EINVAL; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + rt = mctp_route_alloc(); + if (!rt) + return -ENOMEM; + + rt->min = daddr_start; + rt->max = daddr_start + daddr_extent; + rt->mtu = mtu; + rt->dev = mdev; + dev_hold(rt->dev->dev); + rt->output = is_local ? mctp_route_input : mctp_route_output; + + ASSERT_RTNL(); + /* Prevent duplicate identical routes. */ + list_for_each_entry(ert, &net->mctp.routes, list) { + if (mctp_rt_compare_exact(rt, ert)) { + mctp_route_release(rt); + return -EEXIST; + } + } + + list_add_rcu(&rt->list, &net->mctp.routes); + + return 0; +} + +static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, + unsigned int daddr_extent) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + mctp_eid_t daddr_end; + bool dropped; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + daddr_end = daddr_start + daddr_extent; + dropped = false; + + ASSERT_RTNL(); + + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev && + rt->min == daddr_start && rt->max == daddr_end) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + dropped = true; + } + } + + return dropped ? 0 : -ENOENT; +} + +int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_add(mdev, addr, 0, 0, true); +} + +int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_remove(mdev, addr, 0); +} + +/* removes all entries for a given device */ +void mctp_route_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + + ASSERT_RTNL(); + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + } + } +} + +/* Incoming packet-handling */ + +static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct mctp_hdr *mh; + + /* basic non-data sanity checks */ + if (dev->type != ARPHRD_MCTP) + goto err_drop; + + if (!pskb_may_pull(skb, sizeof(struct mctp_hdr))) + goto err_drop; + + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + + /* We have enough for a header; decode and route */ + mh = mctp_hdr(skb); + if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) + goto err_drop; + + cb = __mctp_cb(skb); + rcu_read_lock(); + cb->net = READ_ONCE(__mctp_dev_get(dev)->net); + rcu_read_unlock(); + + rt = mctp_route_lookup(net, cb->net, mh->dest); + if (!rt) + goto err_drop; + + mctp_do_route(rt, skb); + + return NET_RX_SUCCESS; + +err_drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mctp_packet_type = { + .type = cpu_to_be16(ETH_P_MCTP), + .func = mctp_pkttype_receive, +}; + +/* netlink interface */ + +static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { + [RTA_DST] = { .type = NLA_U8 }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing. + * tb must hold RTA_MAX+1 elements. 
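+ * + * RTA_DST carries the start EID (u8) and RTA_OIF the outgoing ifindex; + * rtm_dst_len is then interpreted as an EID range extent (a count of + * EIDs above RTA_DST) rather than a prefix length in bits.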
+ */ +static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr **tb, struct rtmsg **rtm, + struct mctp_dev **mdev, mctp_eid_t *daddr_start) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + unsigned int ifindex; + int rc; + + rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, + rta_mctp_policy, extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[RTA_DST]) { + NL_SET_ERR_MSG(extack, "dst EID missing"); + return -EINVAL; + } + *daddr_start = nla_get_u8(tb[RTA_DST]); + + if (!tb[RTA_OIF]) { + NL_SET_ERR_MSG(extack, "ifindex missing"); + return -EINVAL; + } + ifindex = nla_get_u32(tb[RTA_OIF]); + + *rtm = nlmsg_data(nlh); + if ((*rtm)->rtm_family != AF_MCTP) { + NL_SET_ERR_MSG(extack, "route family must be AF_MCTP"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + + if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, "no routes to loopback"); + return -EINVAL; + } + + return 0; +} + +static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + unsigned int mtu; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + if (rtm->rtm_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); + return -EINVAL; + } + + /* TODO: parse mtu from nlparse */ + mtu = 0; + + rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, false); + return rc; +} + +static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + /* we only have unicast routes */ + if (rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + + rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len); + return rc; +} + +static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, + u32 portid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *hdr; + void *metrics; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->rtm_family = AF_MCTP; + + /* we use the _len fields as a number of EIDs, rather than + * a number of bits in the address + */ + hdr->rtm_dst_len = rt->max - rt->min; + hdr->rtm_src_len = 0; + hdr->rtm_tos = 0; + hdr->rtm_table = RT_TABLE_DEFAULT; + hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ + hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */ + hdr->rtm_type = RTN_ANYCAST; /* TODO: type from route */ + + if (nla_put_u8(skb, RTA_DST, rt->min)) + goto cancel; + + metrics = nla_nest_start_noflag(skb, RTA_METRICS); + if (!metrics) + goto cancel; + + if (rt->mtu) { + if (nla_put_u32(skb, RTAX_MTU, rt->mtu)) + goto cancel; + } + + nla_nest_end(skb, metrics); + + if (rt->dev) { + if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) + goto cancel; + } + + /* TODO: conditional neighbour physaddr? 
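(ie, optionally emit the resolved link-layer address when a neighbour entry exists for this route) 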
*/ + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mctp_route *rt; + int s_idx, idx; + + /* TODO: allow filtering on route data, possibly under + * cb->strict_check + */ + + /* TODO: change to struct overlay */ + s_idx = cb->args[0]; + idx = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (idx++ < s_idx) + continue; + if (mctp_fill_rtinfo(skb, rt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + break; + } + + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +/* net namespace implementation */ +static int __net_init mctp_routes_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->routes); + INIT_HLIST_HEAD(&ns->binds); + mutex_init(&ns->bind_lock); + INIT_HLIST_HEAD(&ns->keys); + spin_lock_init(&ns->keys_lock); + WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET)); + return 0; +} + +static void __net_exit mctp_routes_net_exit(struct net *net) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) + mctp_route_release(rt); +} + +static struct pernet_operations mctp_net_ops = { + .init = mctp_routes_net_init, + .exit = mctp_routes_net_exit, +}; + +int __init mctp_routes_init(void) +{ + dev_add_pack(&mctp_packet_type); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE, + NULL, mctp_dump_rtinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE, + mctp_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE, + mctp_delroute, NULL, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_routes_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_DELROUTE); + rtnl_unregister(PF_MCTP, RTM_NEWROUTE); + rtnl_unregister(PF_MCTP, RTM_GETROUTE); + dev_remove_pack(&mctp_packet_type); +} diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 05a21dd072df..ffeb2df8be7a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -407,7 +407,6 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; - dec.ttl -= 1; /* Find the output device */ out_dev = rcu_dereference(nh->nh_dev); @@ -431,6 +430,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); + dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 551976e4284c..8ecad71b3613 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -99,7 +99,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: - flow_tuple->mtu = ip6_dst_mtu_forward(dst); + flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index baf235721c43..000bb3da4f77 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -187,14 +187,14 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, 
sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -263,7 +263,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -271,7 +271,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 380f95aacdec..24b7cf447bc5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2545,13 +2545,15 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); + if (err == -ESRCH) + err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); - if (!err || err == -ESRCH) + if (!err) err = err2; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2d6fdf40df66..1afca2a6c2ac 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -40,14 +40,6 @@ void genl_unlock(void) } EXPORT_SYMBOL(genl_unlock); -#ifdef CONFIG_LOCKDEP -bool lockdep_genl_is_held(void) -{ - return lockdep_is_held(&genl_mutex); -} -EXPORT_SYMBOL(lockdep_genl_is_held); -#endif - static void genl_lock_all(void) { down_write(&cb_lock); @@ -1485,6 +1477,7 @@ int genlmsg_multicast_allns(const struct genl_family *family, { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; + group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group, flags); } @@ -1495,14 +1488,12 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - int report = 0; - - if (info->nlhdr) - report = nlmsg_report(info->nlhdr); if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; + group = family->mcgrp_offset + group; - nlmsg_notify(sk, skb, info->snd_portid, group, report, flags); + nlmsg_notify(sk, skb, info->snd_portid, group, + nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify); diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 4a9e72073564..6024fad905ff 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -79,7 +79,7 @@ int __init af_nfc_init(void) return sock_register(&nfc_sock_family_ops); } -void af_nfc_exit(void) +void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); } diff --git a/net/nfc/core.c b/net/nfc/core.c index 573c80c6ff7a..3c645c1d99c9 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -636,7 +636,7 @@ error: return rc; } -int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); @@ -665,7 +665,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) 
EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, - u8 *gb, size_t gb_len) + const u8 *gb, size_t gb_len) { int rc; @@ -824,7 +824,7 @@ EXPORT_SYMBOL(nfc_targets_found); */ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { - struct nfc_target *tg; + const struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); @@ -1048,7 +1048,7 @@ struct nfc_dev *nfc_get_device(unsigned int idx) * @tx_headroom: reserved space at beginning of skb * @tx_tailroom: reserved space at end of skb */ -struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, +struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom) { diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 5044c7db577e..fefc03674f4f 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -732,7 +732,7 @@ exit: return rc; } -static struct nfc_ops digital_nfc_ops = { +static const struct nfc_ops digital_nfc_ops = { .dev_up = digital_dev_up, .dev_down = digital_dev_down, .start_poll = digital_start_poll, @@ -745,7 +745,7 @@ static struct nfc_ops digital_nfc_ops = { .im_transceive = digital_in_send, }; -struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, +struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops, __u32 supported_protocols, __u32 driver_capabilities, int tx_headroom, int tx_tailroom) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 3481941be70b..ceb87db57cdb 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -128,7 +128,7 @@ static void nfc_hci_msg_rx_work(struct work_struct *work) struct nfc_hci_dev *hdev = container_of(work, struct nfc_hci_dev, msg_rx_work); struct sk_buff *skb; - struct hcp_message *message; + const struct hcp_message *message; u8 pipe; u8 type; u8 instruction; @@ -182,9 +182,9 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 status = NFC_HCI_ANY_OK; - struct hci_create_pipe_resp *create_info; - struct hci_delete_pipe_noti *delete_info; - struct hci_all_pipe_cleared_noti *cleared_info; + const struct hci_create_pipe_resp *create_info; + const struct hci_delete_pipe_noti *delete_info; + const struct hci_all_pipe_cleared_noti *cleared_info; u8 gate; pr_debug("from pipe %x cmd %x\n", pipe, cmd); @@ -447,7 +447,7 @@ static void nfc_hci_cmd_timeout(struct timer_list *t) } static int hci_dev_connect_gates(struct nfc_hci_dev *hdev, u8 gate_count, - struct nfc_hci_gate *gates) + const struct nfc_hci_gate *gates) { int r; while (gate_count--) { @@ -928,7 +928,7 @@ static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return hdev->ops->fw_download(hdev, firmware_name); } -static struct nfc_ops hci_nfc_ops = { +static const struct nfc_ops hci_nfc_ops = { .dev_up = hci_dev_up, .dev_down = hci_dev_down, .start_poll = hci_start_poll, @@ -947,7 +947,7 @@ static struct nfc_ops hci_nfc_ops = { .se_io = hci_se_io, }; -struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, +struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops, struct nfc_hci_init_data *init_data, unsigned long quirks, u32 protocols, diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 6ab40ea17662..2140f6724644 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -11,7 +11,7 @@ static LIST_HEAD(llc_engines); -int nfc_llc_init(void) +int __init nfc_llc_init(void) { int r; @@ -41,7 
+41,7 @@ void nfc_llc_exit(void) } } -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops) +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) { struct nfc_llc_engine *llc_engine; diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index 823ddb621e5d..d66271d211a5 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -26,20 +26,20 @@ struct nfc_llc_ops { struct nfc_llc_engine { const char *name; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; struct list_head entry; }; struct nfc_llc { void *data; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; int rx_headroom; int rx_tailroom; }; void *nfc_llc_get_data(struct nfc_llc *llc); -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops); +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops); void nfc_llc_unregister(const char *name); int nfc_llc_nop_register(void); diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index a42852f36f2e..a58716f16954 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -71,7 +71,7 @@ static int llc_nop_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return llc_nop->xmit_to_drv(llc_nop->hdev, skb); } -static struct nfc_llc_ops llc_nop_ops = { +static const struct nfc_llc_ops llc_nop_ops = { .init = llc_nop_init, .deinit = llc_nop_deinit, .start = llc_nop_start, diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 1e3a90049da9..aef750d7787c 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -123,7 +123,7 @@ static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z) return ((y >= x) || (y < z)) ? true : false; } -static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, +static struct sk_buff *llc_shdlc_alloc_skb(const struct llc_shdlc *shdlc, int payload_len) { struct sk_buff *skb; @@ -137,7 +137,7 @@ static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, } /* immediately sends an S frame. */ -static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_s_frame(const struct llc_shdlc *shdlc, enum sframe_type sframe_type, int nr) { int r; @@ -159,7 +159,7 @@ static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, } /* immediately sends an U frame. 
skb may contain optional payload */ -static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_u_frame(const struct llc_shdlc *shdlc, struct sk_buff *skb, enum uframe_modifier uframe_modifier) { @@ -361,7 +361,7 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r) wake_up(shdlc->connect_wq); } -static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc) { struct sk_buff *skb; @@ -377,7 +377,7 @@ static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET); } -static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc) { struct sk_buff *skb; @@ -820,7 +820,7 @@ static int llc_shdlc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return 0; } -static struct nfc_llc_ops llc_shdlc_ops = { +static const struct nfc_llc_ops llc_shdlc_ops = { .init = llc_shdlc_init, .deinit = llc_shdlc_deinit, .start = llc_shdlc_start, diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index 97853c9cefc7..d49d4bf2e37c 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -221,15 +221,15 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock); /* TLV API */ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); /* Commands API */ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length); +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length); struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap); -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len); void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 475061c79c44..3c4172a5aeb5 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -15,7 +15,7 @@ #include "nfc.h" #include "llcp.h" -static u8 llcp_tlv_length[LLCP_TLV_MAX] = { +static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ @@ -29,7 +29,7 @@ static u8 llcp_tlv_length[LLCP_TLV_MAX] = { }; -static u8 llcp_tlv8(u8 *tlv, u8 type) +static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -37,7 +37,7 @@ static u8 llcp_tlv8(u8 *tlv, u8 type) return tlv[2]; } -static u16 llcp_tlv16(u8 *tlv, u8 type) +static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -46,37 +46,37 @@ static u16 llcp_tlv16(u8 *tlv, u8 type) } -static u8 llcp_tlv_version(u8 *tlv) +static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } -static u16 llcp_tlv_miux(u8 *tlv) +static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } -static u16 llcp_tlv_wks(u8 *tlv) +static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } -static u16 llcp_tlv_lto(u8 *tlv) +static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } -static u8 
llcp_tlv_opt(u8 *tlv) +static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } -static u8 llcp_tlv_rw(u8 *tlv) +static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length) +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; @@ -130,7 +130,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) return sdres; } -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; @@ -190,9 +190,10 @@ void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -239,9 +240,10 @@ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -295,7 +297,7 @@ static struct sk_buff *llcp_add_header(struct sk_buff *pdu, return pdu; } -static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv, +static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ @@ -389,9 +391,10 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *service_name_tlv = NULL, service_name_tlv_length; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *service_name_tlv = NULL; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 service_name_tlv_length, miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; @@ -465,8 +468,9 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index cc997518f79d..eaeb2b1cfa6a 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -301,7 +301,7 @@ static char *wks[] = { "urn:nfc:sn:snep", }; -static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) +static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; @@ -325,7 +325,7 @@ static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; @@ -522,7 +522,7 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; - u8 *version_tlv = NULL, *lto_tlv = NULL, + const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; @@ 
-612,7 +612,7 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) return local->gb; } -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; @@ -639,27 +639,27 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) local->remote_gb_len - 3); } -static u8 nfc_llcp_dsap(struct sk_buff *pdu) +static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } -static u8 nfc_llcp_ptype(struct sk_buff *pdu) +static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } -static u8 nfc_llcp_ssap(struct sk_buff *pdu) +static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } -static u8 nfc_llcp_ns(struct sk_buff *pdu) +static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } -static u8 nfc_llcp_nr(struct sk_buff *pdu) +static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } @@ -801,7 +801,7 @@ out: } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct nfc_llcp_sock *llcp_sock; @@ -815,9 +815,10 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, return llcp_sock; } -static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len) +static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { - u8 *tlv = &skb->data[2], type, length; + u8 type, length; + const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { @@ -875,7 +876,7 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; @@ -893,7 +894,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, goto fail; } } else { - u8 *sn; + const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); @@ -1112,7 +1113,7 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1155,7 +1156,8 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1188,7 +1190,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1226,12 +1229,13 @@ static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; - u8 dsap, ssap, *tlv, type, length, tid, sap; + u8 dsap, ssap, type, length, tid, sap; + const u8 *tlv; u16 tlv_len, offset; - char *service_name; + 
const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index da7fe9db1b00..80a5c2a8e9fa 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -53,9 +53,9 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, } int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, - struct dest_spec_params *params) + const struct dest_spec_params *params) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { if (conn_info->dest_type == dest_type) { @@ -210,14 +210,15 @@ static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) } struct nci_set_config_param { - __u8 id; - size_t len; - __u8 *val; + __u8 id; + size_t len; + const __u8 *val; }; static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_set_config_param *param = (struct nci_set_config_param *)opt; + const struct nci_set_config_param *param = + (struct nci_set_config_param *)opt; struct nci_core_set_config_cmd cmd; BUG_ON(param->len > NCI_MAX_PARAM_LEN); @@ -237,7 +238,7 @@ struct nci_rf_discover_param { static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_rf_discover_param *param = + const struct nci_rf_discover_param *param = (struct nci_rf_discover_param *)opt; struct nci_rf_disc_cmd cmd; @@ -303,7 +304,7 @@ struct nci_rf_discover_select_param { static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_rf_discover_select_param *param = + const struct nci_rf_discover_select_param *param = (struct nci_rf_discover_select_param *)opt; struct nci_rf_discover_select_cmd cmd; @@ -341,18 +342,18 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) struct nci_cmd_param { __u16 opcode; size_t len; - __u8 *payload; + const __u8 *payload; }; static void nci_generic_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_cmd_param *param = + const struct nci_cmd_param *param = (struct nci_cmd_param *)opt; nci_send_cmd(ndev, param->opcode, param->len, param->payload); } -int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload) { struct nci_cmd_param param; @@ -365,7 +366,8 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) } EXPORT_SYMBOL(nci_prop_cmd); -int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) +int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, + const __u8 *payload) { struct nci_cmd_param param; @@ -399,7 +401,7 @@ struct nci_loopback_data { static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_loopback_data *data = (struct nci_loopback_data *)opt; + const struct nci_loopback_data *data = (struct nci_loopback_data *)opt; nci_send_data(ndev, data->conn_id, data->data); } @@ -407,7 +409,7 @@ static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); if (!conn_info) { @@ -420,7 +422,7 @@ static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) nci_req_complete(ndev, 
NCI_STATUS_OK); } -int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, +int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len, struct sk_buff **resp) { int r; @@ -624,7 +626,7 @@ static int nci_dev_down(struct nfc_dev *nfc_dev) return nci_close_device(ndev); } -int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) +int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val) { struct nci_set_config_param param; @@ -659,7 +661,7 @@ EXPORT_SYMBOL(nci_nfcee_discover); static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_nfcee_mode_set_cmd *cmd = + const struct nci_nfcee_mode_set_cmd *cmd = (struct nci_nfcee_mode_set_cmd *)opt; nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD, @@ -681,7 +683,7 @@ EXPORT_SYMBOL(nci_nfcee_mode_set); static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) { - struct core_conn_create_data *data = + const struct core_conn_create_data *data = (struct core_conn_create_data *)opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd); @@ -690,7 +692,7 @@ static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, u8 number_destination_params, size_t params_len, - struct core_conn_create_dest_spec_params *params) + const struct core_conn_create_dest_spec_params *params) { int r; struct nci_core_conn_create_cmd *cmd; @@ -863,7 +865,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_rf_discover_select_param param; - struct nfc_target *nci_target = NULL; + const struct nfc_target *nci_target = NULL; int i; int rc = 0; @@ -1004,7 +1006,7 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; if (!conn_info) @@ -1102,7 +1104,7 @@ static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return ndev->ops->fw_download(ndev, firmware_name); } -static struct nfc_ops nci_nfc_ops = { +static const struct nfc_ops nci_nfc_ops = { .dev_up = nci_dev_up, .dev_down = nci_dev_down, .start_poll = nci_start_poll, @@ -1129,7 +1131,7 @@ static struct nfc_ops nci_nfc_ops = { * @tx_headroom: Reserved space at beginning of skb * @tx_tailroom: Reserved space at end of skb */ -struct nci_dev *nci_allocate_device(struct nci_ops *ops, +struct nci_dev *nci_allocate_device(const struct nci_ops *ops, __u32 supported_protocols, int tx_headroom, int tx_tailroom) { @@ -1152,8 +1154,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { pr_err("Too many proprietary commands: %zd\n", ops->n_prop_ops); - ops->prop_ops = NULL; - ops->n_prop_ops = 0; + goto free_nci; } ndev->tx_headroom = tx_headroom; @@ -1270,7 +1271,7 @@ EXPORT_SYMBOL(nci_register_device); */ void nci_unregister_device(struct nci_dev *ndev) { - struct nci_conn_info *conn_info, *n; + struct nci_conn_info *conn_info, *n; nci_close_device(ndev); @@ -1332,7 +1333,7 @@ int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) EXPORT_SYMBOL(nci_send_frame); /* Send NCI command */ -int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) +int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload) { struct nci_ctrl_hdr *hdr; 
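
/*
 * Editorial sketch, not part of the patch: the constification of the NCI
 * command payload above is safe because the send path only ever copies the
 * caller's buffer into a freshly allocated skb and never writes through the
 * pointer. A minimal user-space model of that pattern (all names here are
 * hypothetical, not kernel APIs):
 */
#include <stdlib.h>
#include <string.h>

struct toy_skb {
	unsigned char *data;
	size_t len;
};

static struct toy_skb *toy_send_cmd(const void *payload, size_t plen)
{
	struct toy_skb *skb = malloc(sizeof(*skb));

	if (!skb)
		return NULL;
	skb->data = malloc(plen);
	if (!skb->data) {
		free(skb);
		return NULL;
	}
	/* The payload is only read, never written, so const is exact. */
	memcpy(skb->data, payload, plen);
	skb->len = plen;
	return skb;
}
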
struct sk_buff *skb; @@ -1364,12 +1365,12 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) EXPORT_SYMBOL(nci_send_cmd); /* Proprietary commands API */ -static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, - size_t n_ops, - __u16 opcode) +static const struct nci_driver_ops *ops_cmd_lookup(const struct nci_driver_ops *ops, + size_t n_ops, + __u16 opcode) { size_t i; - struct nci_driver_ops *op; + const struct nci_driver_ops *op; if (!ops || !n_ops) return NULL; @@ -1384,10 +1385,10 @@ static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, } static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, rsp_opcode); if (!op || !op->rsp) @@ -1397,10 +1398,10 @@ static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, } static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, ntf_opcode); if (!op || !op->ntf) @@ -1442,7 +1443,7 @@ int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, static void nci_tx_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, tx_work); - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index ce3382be937f..6055dc9a82aa 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -26,7 +26,7 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, __u8 conn_id, int err) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; data_exchange_cb_t cb; void *cb_context; @@ -80,7 +80,7 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev, int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) @@ -93,9 +93,9 @@ EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size); static int nci_queue_tx_data_frags(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int total_len = skb->len; - unsigned char *data = skb->data; + const unsigned char *data = skb->data; unsigned long flags; struct sk_buff_head frags_q; struct sk_buff *skb_frag; @@ -166,7 +166,7 @@ exit: /* Send NCI data */ int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int rc = 0; pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len); @@ -269,7 +269,7 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) __u8 pbf = nci_pbf(skb->data); __u8 status = 0; __u8 conn_id = nci_conn_id(skb->data); - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; pr_debug("len %d\n", skb->len); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index d6732e5e8958..a8ff794a8084 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -16,11 +16,11 @@ #include <linux/nfc.h> struct 
nci_data { - u8 conn_id; - u8 pipe; - u8 cmd; - const u8 *data; - u32 data_len; + u8 conn_id; + u8 pipe; + u8 cmd; + const u8 *data; + u32 data_len; } __packed; struct nci_hci_create_pipe_params { @@ -142,7 +142,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; @@ -197,7 +197,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt) { - struct nci_data *data = (struct nci_data *)opt; + const struct nci_data *data = (struct nci_data *)opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); @@ -221,8 +221,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -363,7 +363,7 @@ exit: static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -406,7 +406,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work) struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; - struct nci_hcp_message *message; + const struct nci_hcp_message *message; u8 pipe, type, instruction; while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { @@ -498,7 +498,7 @@ void nci_hci_data_received_cb(void *context, int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -523,7 +523,7 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; - struct nci_hci_create_pipe_resp *resp; + const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); @@ -557,8 +557,8 @@ static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; @@ -605,8 +605,8 @@ EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -697,7 +697,7 @@ EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, - struct nci_hci_gate *gates) + const struct nci_hci_gate *gates) { int r; @@ -714,7 +714,7 @@ static int nci_hci_dev_connect_gates(struct nci_dev *ndev, int nci_hci_dev_session_init(struct nci_dev *ndev) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; int r; diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 98af04c86b2c..c5eacaac41ae 100644 --- 
a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -28,10 +28,10 @@ /* Handle NCI Notification packets */ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf = (void *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -48,7 +48,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; int i; pr_debug("num_entries %d\n", ntf->num_entries); @@ -80,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, } static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -107,9 +107,10 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); } -static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfca_poll *nfca_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfca_poll *nfca_poll, + const __u8 *data) { nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; @@ -134,9 +135,10 @@ static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcb_poll *nfcb_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcb_poll *nfcb_poll, + const __u8 *data) { nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); @@ -148,9 +150,10 @@ static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_poll *nfcf_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_poll *nfcf_poll, + const __u8 *data) { nfcf_poll->bit_rate = *data++; nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); @@ -164,9 +167,10 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcv_poll *nfcv_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + const __u8 *data) { ++data; nfcv_poll->dsfid = *data++; @@ -175,9 +179,10 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_listen *nfcf_listen, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_listen *nfcf_listen, + const __u8 *data) { nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); @@ -198,12 +203,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct 
nfc_target *target, __u8 rf_protocol, __u8 rf_tech_and_mode, - void *params) + const void *params) { - struct rf_tech_specific_params_nfca_poll *nfca_poll; - struct rf_tech_specific_params_nfcb_poll *nfcb_poll; - struct rf_tech_specific_params_nfcf_poll *nfcf_poll; - struct rf_tech_specific_params_nfcv_poll *nfcv_poll; + const struct rf_tech_specific_params_nfca_poll *nfca_poll; + const struct rf_tech_specific_params_nfcb_poll *nfcb_poll; + const struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + const struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) @@ -274,7 +279,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, } static void nci_add_new_target(struct nci_dev *ndev, - struct nci_rf_discover_ntf *ntf) + const struct nci_rf_discover_ntf *ntf) { struct nfc_target *target; int i, rc; @@ -319,10 +324,10 @@ void nci_clear_target_list(struct nci_dev *ndev) } static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; bool add_target = true; ntf.rf_discovery_id = *data++; @@ -382,7 +387,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_nfca_poll_iso_dep *nfca_poll; struct activation_params_nfcb_poll_iso_dep *nfcb_poll; @@ -418,7 +424,8 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, } static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_poll_nfc_dep *poll; struct activation_params_listen_nfc_dep *listen; @@ -454,7 +461,7 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, } static void nci_target_auto_activated(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { struct nfc_target *target; int rc; @@ -477,7 +484,7 @@ static void nci_target_auto_activated(struct nci_dev *ndev, } static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { ndev->remote_gb_len = 0; @@ -519,11 +526,11 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, } static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; int err = NCI_STATUS_OK; ntf.rf_discovery_id = *data++; @@ -681,10 +688,10 @@ listen: } static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; - struct nci_rf_deactivate_ntf *ntf = (void *) skb->data; + const struct nci_conn_info *conn_info; + const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); @@ -725,10 +732,10 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { u8 status 
= NCI_STATUS_OK; - struct nci_nfcee_discover_ntf *nfcee_ntf = + const struct nci_nfcee_discover_ntf *nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; pr_debug("\n"); @@ -745,7 +752,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, } static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { pr_debug("\n"); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index e9605922a322..a2e72c003805 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -25,9 +25,10 @@ /* Handle NCI Response packets */ -static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_reset_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_reset_rsp *rsp = (void *) skb->data; + const struct nci_core_reset_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -43,10 +44,11 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) } } -static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data; - struct nci_core_init_rsp_2 *rsp_2; + const struct nci_core_init_rsp_1 *rsp_1 = (void *)skb->data; + const struct nci_core_init_rsp_2 *rsp_2; pr_debug("status 0x%x\n", rsp_1->status); @@ -81,10 +83,11 @@ static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; - u8 *supported_rf_interface = rsp->supported_rf_interfaces; + const struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; + const u8 *supported_rf_interface = rsp->supported_rf_interfaces; u8 rf_interface_idx = 0; u8 rf_extension_cnt = 0; @@ -118,7 +121,7 @@ static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_init_rsp_packet(struct nci_dev *ndev, const struct sk_buff *skb) { u8 status = 0; @@ -160,9 +163,9 @@ exit: } static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_core_set_config_rsp *rsp = (void *) skb->data; + const struct nci_core_set_config_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -170,7 +173,7 @@ static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, } static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -179,9 +182,10 @@ static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, nci_req_complete(ndev, status); } -static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; __u8 status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -210,7 +214,7 @@ exit: } static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -222,7 +226,7 @@ static void nci_rf_disc_select_rsp_packet(struct 
nci_dev *ndev, } static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -238,9 +242,9 @@ static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_nfcee_discover_rsp *discover_rsp; + const struct nci_nfcee_discover_rsp *discover_rsp; if (skb->len != 2) { nci_req_complete(ndev, NCI_STATUS_NFCEE_PROTOCOL_ERROR); @@ -255,7 +259,7 @@ static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -264,11 +268,11 @@ static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, } static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; struct nci_conn_info *conn_info = NULL; - struct nci_core_conn_create_rsp *rsp; + const struct nci_core_conn_create_rsp *rsp; pr_debug("status 0x%x\n", status); @@ -319,7 +323,7 @@ exit: } static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_conn_info *conn_info; __u8 status = skb->data[0]; diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 7d8e10e27c20..0935527d1d12 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -27,7 +27,7 @@ #define CRC_INIT 0xFFFF -static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb, +static int __nci_spi_send(struct nci_spi *nspi, const struct sk_buff *skb, int cs_change) { struct spi_message m; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 722f7ef891e1..49089c50872e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -530,7 +530,7 @@ free_msg: int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { - struct nfc_se *se; + const struct nfc_se *se; struct sk_buff *msg; void *hdr; @@ -1531,7 +1531,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; - struct nfc_vendor_cmd *cmd; + const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 889fefd64e56..de2ec66d7e83 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -48,7 +48,7 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_llcp_register_device(struct nfc_dev *dev); void nfc_llcp_unregister_device(struct nfc_dev *dev); -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len); +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len); u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len); int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ef15d9eb4774..076774034bb9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -924,7 +924,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); + else + upcall.portid = nla_get_u32(a); break; case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { diff --git 
a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index bc164b35e67d..67ad08320886 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, static void ovs_dp_masks_rebalance(struct work_struct *work); +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -239,7 +242,13 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) @@ -1594,16 +1603,70 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0; + int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) @@ -1624,6 +1687,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) dp->user_features = user_features; + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) 
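
/*
 * Editorial sketch, not from the patch: the lookup logic of
 * ovs_dp_get_upcall_portid() in isolation, with the RCU dereference and the
 * ratelimited log stripped out. If the handler-thread array is shorter than
 * the CPU count, the CPU id wraps modulo the array size instead of dropping
 * the packet; an empty or missing array yields portid 0.
 */
#include <stdint.h>
#include <stddef.h>

struct toy_pids {
	uint32_t n_pids;
	uint32_t pids[];	/* one netlink portid per handler thread */
};

static uint32_t toy_upcall_portid(const struct toy_pids *p, uint32_t cpu_id)
{
	if (!p || p->n_pids == 0)
		return 0;
	if (cpu_id < p->n_pids)
		return p->pids[cpu_id];
	return p->pids[cpu_id % p->n_pids];	/* wrap on CPU/PID mismatch */
}
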
static_branch_enable(&tc_recirc_sharing_support); else diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 38f7d3e66ca6..fcfe6cb46441 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -51,6 +51,21 @@ struct dp_stats_percpu { }; /** + * struct dp_nlsk_pids - array of netlink portids for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. + * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; +}; + +/** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. @@ -61,6 +76,7 @@ struct dp_stats_percpu { * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -87,6 +103,8 @@ struct datapath { /* Switch meters. */ struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** @@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); + const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index ac0fae06cc15..876d0ae5f9fd 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -233,11 +233,11 @@ static int phonet_device_autoconf(struct net_device *dev) struct if_phonet_req req; int ret; - if (!dev->netdev_ops->ndo_do_ioctl) + if (!dev->netdev_ops->ndo_siocdevprivate) return -EOPNOTSUPP; - ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req, - SIOCPNGAUTOCONF); + ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req, + NULL, SIOCPNGAUTOCONF); if (ret < 0) return ret; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 171b7f3be6ef..6c61b7b1838f 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1157,14 +1157,14 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: - if (copy_from_user(&ifr, argp, sizeof(ifr))) { + if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; - if (copy_to_user(argp, &ifr, sizeof(ifr))) { + if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d17a66aab8ee..998a2374f7ae 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1351,8 +1351,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - return 0; if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); @@ -1423,8 +1421,6 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } @@ -1481,7 +1477,6 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; - int err = 0; skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, GFP_KERNEL); @@ -1495,11 +1490,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 8d17a543cc9f..762ceec3e6f6 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/inet_ecn.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -21,15 +22,13 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; -#define MAX_EDIT_LEN ETH_HLEN static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action; + int action, max_edit_len, err; struct tcf_skbmod_params *p; u64 flags; - int err; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); @@ -38,19 +37,34 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, if (unlikely(action == TC_ACT_SHOT)) goto drop; - if (!skb->dev || skb->dev->type != ARPHRD_ETHER) - return action; + max_edit_len = skb_mac_header_len(skb); + p = rcu_dereference_bh(d->skbmod_p); + flags = p->flags; + + /* tcf_skbmod_init() guarantees "flags" to be one of the following: + * 1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE} + * 2. SKBMOD_F_SWAPMAC + * 3. SKBMOD_F_ECN + * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet + * packets. 
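
/*
 * Editorial sketch, not from the patch: what INET_ECN_set_ce() does for
 * IPv4 at the bit level, modelled in isolation. The ECN field is the low
 * two bits of the TOS byte; Not-ECT (00) packets must not be remarked, and
 * CE is 11. The real helper also fixes up the IPv4 header checksum, which
 * this toy version omits.
 */
#include <stdint.h>
#include <stdbool.h>

#define TOY_ECN_MASK	0x03u
#define TOY_ECN_NOT_ECT	0x00u
#define TOY_ECN_CE	0x03u

static bool toy_ipv4_set_ce(uint8_t *tos)
{
	if ((*tos & TOY_ECN_MASK) == TOY_ECN_NOT_ECT)
		return false;	/* not ECN-capable transport: leave as-is */
	*tos |= TOY_ECN_CE;	/* ECT(0), ECT(1) and CE all map to CE (11) */
	return true;
}
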
+ */ + if (flags == SKBMOD_F_ECN) { + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + case cpu_to_be16(ETH_P_IPV6): + max_edit_len += skb_network_header_len(skb); + break; + default: + goto out; + } + } else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) { + goto out; + } - /* XXX: if you are going to edit more fields beyond ethernet header - * (example when you add IP header replacement or vlan swap) - * then MAX_EDIT_LEN needs to change appropriately - */ - err = skb_ensure_writable(skb, MAX_EDIT_LEN); + err = skb_ensure_writable(skb, max_edit_len); if (unlikely(err)) /* best policy is to drop on the floor */ goto drop; - p = rcu_dereference_bh(d->skbmod_p); - flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); if (flags & SKBMOD_F_SMAC) @@ -66,6 +80,10 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); } + if (flags & SKBMOD_F_ECN) + INET_ECN_set_ce(skb); + +out: return action; drop: @@ -129,6 +147,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, index = parm->index; if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; + if (parm->flags & SKBMOD_F_ECN) + lflags = SKBMOD_F_ECN; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e3e79e9bd706..7be5b9d2aead 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1577,21 +1577,11 @@ reset: #endif } -int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +int tcf_classify(struct sk_buff *skb, + const struct tcf_block *block, + const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { - u32 last_executed_chain = 0; - - return __tcf_classify(skb, tp, tp, res, compat_mode, - &last_executed_chain); -} -EXPORT_SYMBOL(tcf_classify); - -int tcf_classify_ingress(struct sk_buff *skb, - const struct tcf_block *ingress_block, - const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; @@ -1603,20 +1593,22 @@ int tcf_classify_ingress(struct sk_buff *skb, struct tc_skb_ext *ext; int ret; - ext = skb_ext_find(skb, TC_SKB_EXT); + if (block) { + ext = skb_ext_find(skb, TC_SKB_EXT); - if (ext && ext->chain) { - struct tcf_chain *fchain; + if (ext && ext->chain) { + struct tcf_chain *fchain; - fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain); - if (!fchain) - return TC_ACT_SHOT; + fchain = tcf_chain_lookup_rcu(block, ext->chain); + if (!fchain) + return TC_ACT_SHOT; - /* Consume, so cloned/redirect skbs won't inherit ext */ - skb_ext_del(skb, TC_SKB_EXT); + /* Consume, so cloned/redirect skbs won't inherit ext */ + skb_ext_del(skb, TC_SKB_EXT); - tp = rcu_dereference_bh(fchain->filter_chain); - last_executed_chain = fchain->index; + tp = rcu_dereference_bh(fchain->filter_chain); + last_executed_chain = fchain->index; + } } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, @@ -1635,7 +1627,7 @@ int tcf_classify_ingress(struct sk_buff *skb, return ret; #endif } -EXPORT_SYMBOL(tcf_classify_ingress); +EXPORT_SYMBOL(tcf_classify); struct tcf_chain_info { struct tcf_proto __rcu **pprev; @@ -1870,13 +1862,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - - if (err > 0) - 
err = 0; return err; } @@ -1909,15 +1898,13 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); - if (err > 0) - err = 0; return err; } @@ -2711,13 +2698,11 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } @@ -2741,7 +2726,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, } if (unicast) - return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + return rtnl_unicast(skb, net, portid); return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } @@ -3832,7 +3817,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru fl = rcu_dereference_bh(qe->filter_chain); - switch (tcf_classify(skb, fl, &cl_res, false)) { + switch (tcf_classify(skb, NULL, fl, &cl_res, false)) { case TC_ACT_SHOT: qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f87d07736a14..5e90e9b160e3 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1845,7 +1845,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) @@ -1856,11 +1855,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tclass_del_notify(struct net *net, @@ -1894,8 +1890,6 @@ static int tclass_del_notify(struct net *net, err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index d0c9a57398fc..7d8518176b45 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -394,7 +394,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 951542843cab..ecc5c4d93779 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1665,7 +1665,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index b79a7e27bb31..2dabaffd39d0 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -228,7 +228,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. 
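
/*
 * Editorial sketch, not from the patch: the cleanup pattern behind the
 * "if (err > 0) err = 0;" removals above. rtnetlink_send() historically
 * leaked the positive byte count from the underlying netlink broadcast, so
 * every notify path carried its own squash. Assuming the helper itself is
 * normalised to return 0 or a negative errno (that change is outside this
 * diff), each call site can simply propagate the result. Toy model with
 * hypothetical names:
 */
static int toy_broadcast(void)
{
	return 128;	/* legacy behaviour: bytes sent on success */
}

static int toy_rtnetlink_send(void)
{
	int err = toy_broadcast();

	return err > 0 ? 0 : err;	/* squash folded into the helper */
}

static int toy_notify(void)
{
	return toy_rtnetlink_send();	/* no per-caller fixup needed */
}
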
*/ - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index fc1e47069593..642cd179b7a7 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -317,7 +317,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index d320bcfb2da2..4c100d105269 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -242,7 +242,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, fl, &res, false); + int result = tcf_classify(skb, NULL, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index c1e84d1eeaba..925924fab1ab 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -390,7 +390,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index bbd5f8753600..c4afdd026f51 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -91,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index cac684952edc..830f3559f727 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -94,7 +94,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_pie_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index bf0034c66e35..b7ac30cca035 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1130,7 +1130,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5f7ac27a5264..81ea8332547a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -238,7 +238,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && 
(result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 5c27b4270b90..e282e7382117 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -36,7 +36,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 3eabb871a1d5..03fdf31ccb6a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -39,7 +39,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index b692a0de1ad5..58a9d42b52b8 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -690,7 +690,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index dde829d4b9f8..3d061a13d7ed 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -257,7 +257,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 066754a18569..f8e569f79f13 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -178,7 +178,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/socket.c b/net/socket.c index 0b2dad3bdf7f..3c10504e46d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -212,6 +212,7 @@ static const char * const pf_family_names[] = { [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", + [PF_MCTP] = "PF_MCTP", }; /* @@ -1064,9 +1065,13 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg); +static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg); -void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) +void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; @@ -1074,6 +1079,22 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, 
diff --git a/net/socket.c b/net/socket.c
index 0b2dad3bdf7f..3c10504e46d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -212,6 +212,7 @@ static const char * const pf_family_names[] = {
 	[PF_QIPCRTR]	= "PF_QIPCRTR",
 	[PF_SMC]	= "PF_SMC",
 	[PF_XDP]	= "PF_XDP",
+	[PF_MCTP]	= "PF_MCTP",
 };
 
 /*
@@ -1064,9 +1065,13 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
  */
 
 static DEFINE_MUTEX(br_ioctl_mutex);
-static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
+static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br,
+			    unsigned int cmd, struct ifreq *ifr,
+			    void __user *uarg);
 
-void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
+void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
+			     unsigned int cmd, struct ifreq *ifr,
+			     void __user *uarg))
 {
 	mutex_lock(&br_ioctl_mutex);
 	br_ioctl_hook = hook;
@@ -1074,6 +1079,22 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 }
 EXPORT_SYMBOL(brioctl_set);
 
+int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
+		  struct ifreq *ifr, void __user *uarg)
+{
+	int err = -ENOPKG;
+
+	if (!br_ioctl_hook)
+		request_module("bridge");
+
+	mutex_lock(&br_ioctl_mutex);
+	if (br_ioctl_hook)
+		err = br_ioctl_hook(net, br, cmd, ifr, uarg);
+	mutex_unlock(&br_ioctl_mutex);
+
+	return err;
+}
+
 static DEFINE_MUTEX(vlan_ioctl_mutex);
 static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
 
@@ -1088,8 +1109,11 @@ EXPORT_SYMBOL(vlan_ioctl_set);
 static long sock_do_ioctl(struct net *net, struct socket *sock,
 			  unsigned int cmd, unsigned long arg)
 {
+	struct ifreq ifr;
+	bool need_copyout;
 	int err;
 	void __user *argp = (void __user *)arg;
+	void __user *data;
 
 	err = sock->ops->ioctl(sock, cmd, arg);
 
@@ -1100,25 +1124,13 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	if (err != -ENOIOCTLCMD)
 		return err;
 
-	if (cmd == SIOCGIFCONF) {
-		struct ifconf ifc;
-		if (copy_from_user(&ifc, argp, sizeof(struct ifconf)))
-			return -EFAULT;
-		rtnl_lock();
-		err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
-		rtnl_unlock();
-		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
-			err = -EFAULT;
-	} else {
-		struct ifreq ifr;
-		bool need_copyout;
-		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+	if (get_user_ifreq(&ifr, &data, argp))
+		return -EFAULT;
+	err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
+	if (!err && need_copyout)
+		if (put_user_ifreq(&ifr, argp))
 			return -EFAULT;
-		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
-		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
-				return -EFAULT;
-	}
+
 	return err;
 }
 
@@ -1140,12 +1152,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	net = sock_net(sk);
 	if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
 		struct ifreq ifr;
+		void __user *data;
 		bool need_copyout;
-		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+		if (get_user_ifreq(&ifr, &data, argp))
 			return -EFAULT;
-		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+		err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
 		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+			if (put_user_ifreq(&ifr, argp))
 				return -EFAULT;
 	} else
 #ifdef CONFIG_WEXT_CORE
@@ -1170,14 +1183,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		case SIOCSIFBR:
 		case SIOCBRADDBR:
 		case SIOCBRDELBR:
-			err = -ENOPKG;
-			if (!br_ioctl_hook)
-				request_module("bridge");
-
-			mutex_lock(&br_ioctl_mutex);
-			if (br_ioctl_hook)
-				err = br_ioctl_hook(net, cmd, argp);
-			mutex_unlock(&br_ioctl_mutex);
+			err = br_ioctl_call(net, NULL, cmd, NULL, argp);
 			break;
 		case SIOCGIFVLAN:
 		case SIOCSIFVLAN:
@@ -1217,6 +1223,11 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 					   cmd == SIOCGSTAMP_NEW,
 					   false);
 			break;
+
+		case SIOCGIFCONF:
+			err = dev_ifconf(net, argp);
+			break;
+
 		default:
 			err = sock_do_ioctl(net, sock, cmd, arg);
 			break;
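The bridge-hook rework above replaces the open-coded request_module()/mutex sequence with br_ioctl_call(), and widens the hook so it receives the bridge device and an already-decoded ifreq. A hedged sketch of a module-side registration under the new signature (the example_* names are hypothetical; in-tree, the bridge module itself performs this registration):

	#include <linux/if_bridge.h>
	#include <linux/sockios.h>

	/* Illustrative hook with the new five-argument signature. */
	static int example_br_ioctl_hook(struct net *net, struct net_bridge *br,
					 unsigned int cmd, struct ifreq *ifr,
					 void __user *uarg)
	{
		switch (cmd) {
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			/* br and ifr are NULL for these deviceless calls;
			 * uarg carries the bridge name. */
			return 0;
		default:
			return -EOPNOTSUPP;
		}
	}

	static int __init example_init(void)
	{
		brioctl_set(example_br_ioctl_hook);
		return 0;
	}

	static void __exit example_exit(void)
	{
		brioctl_set(NULL);	/* unregister on module unload */
	}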
@@ -3126,154 +3137,55 @@ void socket_seq_show(struct seq_file *seq)
 }
 #endif				/* CONFIG_PROC_FS */
 
-#ifdef CONFIG_COMPAT
-static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
+/* Handle the fact that while struct ifreq has the same *layout* on
+ * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
+ * which are handled elsewhere, it still has different *size* due to
+ * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
+ * resulting in struct ifreq being 32 and 40 bytes respectively).
+ * As a result, if the struct happens to be at the end of a page and
+ * the next page isn't readable/writable, we get a fault. To prevent
+ * that, copy back and forth to the full size.
+ */
+int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg)
 {
-	struct compat_ifconf ifc32;
-	struct ifconf ifc;
-	int err;
+	if (in_compat_syscall()) {
+		struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr;
 
-	if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
-		return -EFAULT;
+		memset(ifr, 0, sizeof(*ifr));
+		if (copy_from_user(ifr32, arg, sizeof(*ifr32)))
+			return -EFAULT;
 
-	ifc.ifc_len = ifc32.ifc_len;
-	ifc.ifc_req = compat_ptr(ifc32.ifcbuf);
+		if (ifrdata)
+			*ifrdata = compat_ptr(ifr32->ifr_data);
 
-	rtnl_lock();
-	err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq));
-	rtnl_unlock();
-	if (err)
-		return err;
+		return 0;
+	}
 
-	ifc32.ifc_len = ifc.ifc_len;
-	if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
+	if (copy_from_user(ifr, arg, sizeof(*ifr)))
 		return -EFAULT;
 
+	if (ifrdata)
+		*ifrdata = ifr->ifr_data;
+
 	return 0;
 }
+EXPORT_SYMBOL(get_user_ifreq);
 
-static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
+int put_user_ifreq(struct ifreq *ifr, void __user *arg)
 {
-	struct compat_ethtool_rxnfc __user *compat_rxnfc;
-	bool convert_in = false, convert_out = false;
-	size_t buf_size = 0;
-	struct ethtool_rxnfc __user *rxnfc = NULL;
-	struct ifreq ifr;
-	u32 rule_cnt = 0, actual_rule_cnt;
-	u32 ethcmd;
-	u32 data;
-	int ret;
-
-	if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	compat_rxnfc = compat_ptr(data);
-
-	if (get_user(ethcmd, &compat_rxnfc->cmd))
-		return -EFAULT;
+	size_t size = sizeof(*ifr);
 
-	/* Most ethtool structures are defined without padding.
-	 * Unfortunately struct ethtool_rxnfc is an exception.
-	 */
-	switch (ethcmd) {
-	default:
-		break;
-	case ETHTOOL_GRXCLSRLALL:
-		/* Buffer size is variable */
-		if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
-			return -EFAULT;
-		if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
-			return -ENOMEM;
-		buf_size += rule_cnt * sizeof(u32);
-		fallthrough;
-	case ETHTOOL_GRXRINGS:
-	case ETHTOOL_GRXCLSRLCNT:
-	case ETHTOOL_GRXCLSRULE:
-	case ETHTOOL_SRXCLSRLINS:
-		convert_out = true;
-		fallthrough;
-	case ETHTOOL_SRXCLSRLDEL:
-		buf_size += sizeof(struct ethtool_rxnfc);
-		convert_in = true;
-		rxnfc = compat_alloc_user_space(buf_size);
-		break;
-	}
+	if (in_compat_syscall())
+		size = sizeof(struct compat_ifreq);
 
-	if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ))
+	if (copy_to_user(arg, ifr, size))
 		return -EFAULT;
 
-	ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc;
-
-	if (convert_in) {
-		/* We expect there to be holes between fs.m_ext and
-		 * fs.ring_cookie and at the end of fs, but nowhere else.
-		 */
-		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
-			     sizeof(compat_rxnfc->fs.m_ext) !=
-			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
-			     sizeof(rxnfc->fs.m_ext));
-		BUILD_BUG_ON(
-			offsetof(struct compat_ethtool_rxnfc, fs.location) -
-			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
-			offsetof(struct ethtool_rxnfc, fs.location) -
-			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
-
-		if (copy_in_user(rxnfc, compat_rxnfc,
-				 (void __user *)(&rxnfc->fs.m_ext + 1) -
-				 (void __user *)rxnfc) ||
-		    copy_in_user(&rxnfc->fs.ring_cookie,
-				 &compat_rxnfc->fs.ring_cookie,
-				 (void __user *)(&rxnfc->fs.location + 1) -
-				 (void __user *)&rxnfc->fs.ring_cookie))
-			return -EFAULT;
-		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
-			if (put_user(rule_cnt, &rxnfc->rule_cnt))
-				return -EFAULT;
-		} else if (copy_in_user(&rxnfc->rule_cnt,
-					&compat_rxnfc->rule_cnt,
-					sizeof(rxnfc->rule_cnt)))
-			return -EFAULT;
-	}
-
-	ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
-	if (ret)
-		return ret;
-
-	if (convert_out) {
-		if (copy_in_user(compat_rxnfc, rxnfc,
-				 (const void __user *)(&rxnfc->fs.m_ext + 1) -
-				 (const void __user *)rxnfc) ||
-		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
-				 &rxnfc->fs.ring_cookie,
-				 (const void __user *)(&rxnfc->fs.location + 1) -
-				 (const void __user *)&rxnfc->fs.ring_cookie) ||
-		    copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
-				 sizeof(rxnfc->rule_cnt)))
-			return -EFAULT;
-
-		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
-			/* As an optimisation, we only copy the actual
-			 * number of rules that the underlying
-			 * function returned. Since Mallory might
-			 * change the rule count in user memory, we
-			 * check that it is less than the rule count
-			 * originally given (as the user buffer size),
-			 * which has been range-checked.
-			 */
-			if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
-				return -EFAULT;
-			if (actual_rule_cnt < rule_cnt)
-				rule_cnt = actual_rule_cnt;
-			if (copy_in_user(&compat_rxnfc->rule_locs[0],
-					 &rxnfc->rule_locs[0],
-					 rule_cnt * sizeof(u32)))
-				return -EFAULT;
-		}
-	}
-
 	return 0;
 }
+EXPORT_SYMBOL(put_user_ifreq);
 
+#ifdef CONFIG_COMPAT
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
 {
 	compat_uptr_t uptr32;
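get_user_ifreq() and put_user_ifreq() are now the single compat-aware entry and exit points for struct ifreq: one copy-in that decodes ifr_data per ABI, one copy-out that writes only the size the caller's ABI expects. A minimal sketch of a caller, assuming a hypothetical handler (example_siocdevprivate is illustrative):

	#include <linux/if.h>
	#include <linux/uaccess.h>

	static int example_siocdevprivate(struct net *net, void __user *argp)
	{
		struct ifreq ifr;
		void __user *data;

		/* data receives ifr_data, run through compat_ptr() for
		 * 32-bit callers. */
		if (get_user_ifreq(&ifr, &data, argp))
			return -EFAULT;

		ifr.ifr_name[IFNAMSIZ - 1] = '\0';
		/* ... act on ifr / data here ... */

		return put_user_ifreq(&ifr, argp);
	}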
@@ -3281,7 +3193,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	void __user *saved;
 	int err;
 
-	if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq)))
+	if (get_user_ifreq(&ifr, NULL, uifr32))
 		return -EFAULT;
 
 	if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
@@ -3290,10 +3202,10 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
 	ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32);
 
-	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
+	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL);
 	if (!err) {
 		ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved;
-		if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq)))
+		if (put_user_ifreq(&ifr, uifr32))
 			err = -EFAULT;
 	}
 	return err;
@@ -3304,97 +3216,13 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 				 struct compat_ifreq __user *u_ifreq32)
 {
 	struct ifreq ifreq;
-	u32 data32;
+	void __user *data;
 
-	if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
-		return -EFAULT;
-	if (get_user(data32, &u_ifreq32->ifr_data))
+	if (get_user_ifreq(&ifreq, &data, u_ifreq32))
 		return -EFAULT;
-	ifreq.ifr_data = compat_ptr(data32);
+	ifreq.ifr_data = data;
 
-	return dev_ioctl(net, cmd, &ifreq, NULL);
-}
-
-static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
-			      unsigned int cmd,
-			      struct compat_ifreq __user *uifr32)
-{
-	struct ifreq __user *uifr;
-	int err;
-
-	/* Handle the fact that while struct ifreq has the same *layout* on
-	 * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
-	 * which are handled elsewhere, it still has different *size* due to
-	 * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
-	 * resulting in struct ifreq being 32 and 40 bytes respectively).
-	 * As a result, if the struct happens to be at the end of a page and
-	 * the next page isn't readable/writable, we get a fault. To prevent
-	 * that, copy back and forth to the full size.
-	 */
-
-	uifr = compat_alloc_user_space(sizeof(*uifr));
-	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
-		return -EFAULT;
-
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
-
-	if (!err) {
-		switch (cmd) {
-		case SIOCGIFFLAGS:
-		case SIOCGIFMETRIC:
-		case SIOCGIFMTU:
-		case SIOCGIFMEM:
-		case SIOCGIFHWADDR:
-		case SIOCGIFINDEX:
-		case SIOCGIFADDR:
-		case SIOCGIFBRDADDR:
-		case SIOCGIFDSTADDR:
-		case SIOCGIFNETMASK:
-		case SIOCGIFPFLAGS:
-		case SIOCGIFTXQLEN:
-		case SIOCGMIIPHY:
-		case SIOCGMIIREG:
-		case SIOCGIFNAME:
-			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
-				err = -EFAULT;
-			break;
-		}
-	}
-	return err;
-}
-
-static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
-			     struct compat_ifreq __user *uifr32)
-{
-	struct ifreq ifr;
-	struct compat_ifmap __user *uifmap32;
-	int err;
-
-	uifmap32 = &uifr32->ifr_ifru.ifru_map;
-	err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
-	err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-	err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-	err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-	err |= get_user(ifr.ifr_map.irq, &uifmap32->irq);
-	err |= get_user(ifr.ifr_map.dma, &uifmap32->dma);
-	err |= get_user(ifr.ifr_map.port, &uifmap32->port);
-	if (err)
-		return -EFAULT;
-
-	err = dev_ioctl(net, cmd, &ifr, NULL);
-
-	if (cmd == SIOCGIFMAP && !err) {
-		err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
-		err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-		err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-		err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-		err |= put_user(ifr.ifr_map.irq, &uifmap32->irq);
-		err |= put_user(ifr.ifr_map.dma, &uifmap32->dma);
-		err |= put_user(ifr.ifr_map.port, &uifmap32->port);
-		if (err)
-			err = -EFAULT;
-	}
-	return err;
+	return dev_ioctl(net, cmd, &ifreq, data, NULL);
 }
 
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
@@ -3420,21 +3248,14 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	struct net *net = sock_net(sk);
 
 	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
-		return compat_ifr_data_ioctl(net, cmd, argp);
+		return sock_ioctl(file, cmd, (unsigned long)argp);
 
 	switch (cmd) {
 	case SIOCSIFBR:
 	case SIOCGIFBR:
 		return old_bridge_ioctl(argp);
-	case SIOCGIFCONF:
-		return compat_dev_ifconf(net, argp);
-	case SIOCETHTOOL:
-		return ethtool_ioctl(net, argp);
 	case SIOCWANDEV:
 		return compat_siocwandev(net, argp);
-	case SIOCGIFMAP:
-	case SIOCSIFMAP:
-		return compat_sioc_ifmap(net, cmd, argp);
 	case SIOCGSTAMP_OLD:
 	case SIOCGSTAMPNS_OLD:
 		if (!sock->ops->gettstamp)
@@ -3442,6 +3263,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 		return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
 					    !COMPAT_USE_64BIT_TIME);
 
+	case SIOCETHTOOL:
 	case SIOCBONDSLAVEINFOQUERY:
 	case SIOCBONDINFOQUERY:
 	case SIOCSHWTSTAMP:
@@ -3459,10 +3281,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGSKNS:
 	case SIOCGSTAMP_NEW:
 	case SIOCGSTAMPNS_NEW:
+	case SIOCGIFCONF:
 		return sock_ioctl(file, cmd, arg);
 
 	case SIOCGIFFLAGS:
 	case SIOCSIFFLAGS:
+	case SIOCGIFMAP:
+	case SIOCSIFMAP:
 	case SIOCGIFMETRIC:
 	case SIOCSIFMETRIC:
 	case SIOCGIFMTU:
@@ -3499,8 +3324,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
-		return compat_ifreq_ioctl(net, sock, cmd, argp);
-
 	case SIOCSARP:
 	case SIOCGARP:
 	case SIOCDARP:
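With the compat translation helpers deleted, a 32-bit task's ifreq-based ioctls now travel the native path, where get_user_ifreq() normalizes the layout. From userspace nothing changes; a minimal userspace C sketch that behaves identically for native and compat binaries:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>

	int main(void)
	{
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
		/* The kernel decodes struct ifreq per-ABI internally now. */
		if (fd >= 0 && ioctl(fd, SIOCGIFMTU, &ifr) == 0)
			printf("lo mtu: %d\n", ifr.ifr_mtu);
		return 0;
	}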
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 070698dd19bc..0ae3478561f4 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -378,6 +378,266 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers);
 
+struct switchdev_nested_priv {
+	bool (*check_cb)(const struct net_device *dev);
+	bool (*foreign_dev_check_cb)(const struct net_device *dev,
+				     const struct net_device *foreign_dev);
+	const struct net_device *dev;
+	struct net_device *lower_dev;
+};
+
+static int switchdev_lower_dev_walk(struct net_device *lower_dev,
+				    struct netdev_nested_priv *priv)
+{
+	struct switchdev_nested_priv *switchdev_priv = priv->data;
+	bool (*foreign_dev_check_cb)(const struct net_device *dev,
+				     const struct net_device *foreign_dev);
+	bool (*check_cb)(const struct net_device *dev);
+	const struct net_device *dev;
+
+	check_cb = switchdev_priv->check_cb;
+	foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb;
+	dev = switchdev_priv->dev;
+
+	if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) {
+		switchdev_priv->lower_dev = lower_dev;
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct net_device *
+switchdev_lower_dev_find(struct net_device *dev,
+			 bool (*check_cb)(const struct net_device *dev),
+			 bool (*foreign_dev_check_cb)(const struct net_device *dev,
+						      const struct net_device *foreign_dev))
+{
+	struct switchdev_nested_priv switchdev_priv = {
+		.check_cb = check_cb,
+		.foreign_dev_check_cb = foreign_dev_check_cb,
+		.dev = dev,
+		.lower_dev = NULL,
+	};
+	struct netdev_nested_priv priv = {
+		.data = &switchdev_priv,
+	};
+
+	netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv);
+
+	return switchdev_priv.lower_dev;
+}
+
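switchdev_lower_dev_find() answers "does this LAG or bridge have any lower interface that is ours and not foreign to dev". Both predicates are supplied by the driver; a hedged sketch of what they might look like (every example_* type and symbol here is hypothetical, not from this patch):

	/* Hypothetical driver types; only the two predicates matter. */
	struct example_switch { int id; };
	struct example_port { struct example_switch *sw; };

	static const struct net_device_ops example_netdev_ops;

	static bool example_dev_check(const struct net_device *dev)
	{
		/* Is this netdev one of our switch ports? */
		return dev->netdev_ops == &example_netdev_ops;
	}

	static bool example_foreign_dev_check(const struct net_device *dev,
					      const struct net_device *foreign_dev)
	{
		const struct example_port *p = netdev_priv(dev);
		const struct example_port *o;

		if (!example_dev_check(foreign_dev))
			return true;	/* not one of our ports at all */

		o = netdev_priv(foreign_dev);
		return p->sw != o->sw;	/* ours, but a different switch */
	}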
+static int __switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct net_device *orig_dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	const struct switchdev_notifier_info *info = &fdb_info->info;
+	struct net_device *br, *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (check_cb(dev))
+		return add_cb(dev, orig_dev, info->ctx, fdb_info);
+
+	if (netif_is_lag_master(dev)) {
+		if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+			goto maybe_bridged_with_us;
+
+		/* This is a LAG interface that we offload */
+		if (!lag_add_cb)
+			return -EOPNOTSUPP;
+
+		return lag_add_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* Recurse through lower interfaces in case the FDB entry is pointing
+	 * towards a bridge device.
+	 */
+	if (netif_is_bridge_master(dev)) {
+		if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+			return 0;
+
+		/* This is a bridge interface that we offload */
+		netdev_for_each_lower_dev(dev, lower_dev, iter) {
+			/* Do not propagate FDB entries across bridges */
+			if (netif_is_bridge_master(lower_dev))
+				continue;
+
+			/* Bridge ports might be either us, or LAG interfaces
+			 * that we offload.
+			 */
+			if (!check_cb(lower_dev) &&
+			    !switchdev_lower_dev_find(lower_dev, check_cb,
+						      foreign_dev_check_cb))
+				continue;
+
+			err = __switchdev_handle_fdb_add_to_device(lower_dev, orig_dev,
+								   fdb_info, check_cb,
+								   foreign_dev_check_cb,
+								   add_cb, lag_add_cb);
+			if (err && err != -EOPNOTSUPP)
+				return err;
+		}
+
+		return 0;
+	}
+
+maybe_bridged_with_us:
+	/* Event is neither on a bridge nor a LAG. Check whether it is on an
+	 * interface that is in a bridge with us.
+	 */
+	br = netdev_master_upper_dev_get_rcu(dev);
+	if (!br || !netif_is_bridge_master(br))
+		return 0;
+
+	if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb))
+		return 0;
+
+	return __switchdev_handle_fdb_add_to_device(br, orig_dev, fdb_info,
+						    check_cb, foreign_dev_check_cb,
+						    add_cb, lag_add_cb);
+}
+
+int switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	int err;
+
+	err = __switchdev_handle_fdb_add_to_device(dev, dev, fdb_info,
+						   check_cb,
+						   foreign_dev_check_cb,
+						   add_cb, lag_add_cb);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_fdb_add_to_device);
+
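A driver would invoke the wrapper from its switchdev notifier; a sketch under the same hypothetical example_* naming (example_fdb_add would be the driver's add_cb):

	#include <net/switchdev.h>

	static int example_switchdev_event(struct notifier_block *nb,
					   unsigned long event, void *ptr)
	{
		struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
		struct switchdev_notifier_info *info = ptr;
		const struct switchdev_notifier_fdb_info *fdb_info =
			container_of(info, struct switchdev_notifier_fdb_info, info);
		int err;

		switch (event) {
		case SWITCHDEV_FDB_ADD_TO_DEVICE:
			err = switchdev_handle_fdb_add_to_device(dev, fdb_info,
								 example_dev_check,
								 example_foreign_dev_check,
								 example_fdb_add,
								 NULL /* no LAG offload */);
			return notifier_from_errno(err);
		}
		return NOTIFY_DONE;
	}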
+static int __switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct net_device *orig_dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	const struct switchdev_notifier_info *info = &fdb_info->info;
+	struct net_device *br, *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (check_cb(dev))
+		return del_cb(dev, orig_dev, info->ctx, fdb_info);
+
+	if (netif_is_lag_master(dev)) {
+		if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+			goto maybe_bridged_with_us;
+
+		/* This is a LAG interface that we offload */
+		if (!lag_del_cb)
+			return -EOPNOTSUPP;
+
+		return lag_del_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* Recurse through lower interfaces in case the FDB entry is pointing
+	 * towards a bridge device.
+	 */
+	if (netif_is_bridge_master(dev)) {
+		if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+			return 0;
+
+		/* This is a bridge interface that we offload */
+		netdev_for_each_lower_dev(dev, lower_dev, iter) {
+			/* Do not propagate FDB entries across bridges */
+			if (netif_is_bridge_master(lower_dev))
+				continue;
+
+			/* Bridge ports might be either us, or LAG interfaces
+			 * that we offload.
+			 */
+			if (!check_cb(lower_dev) &&
+			    !switchdev_lower_dev_find(lower_dev, check_cb,
+						      foreign_dev_check_cb))
+				continue;
+
+			err = __switchdev_handle_fdb_del_to_device(lower_dev, orig_dev,
+								   fdb_info, check_cb,
+								   foreign_dev_check_cb,
+								   del_cb, lag_del_cb);
+			if (err && err != -EOPNOTSUPP)
+				return err;
+		}
+
+		return 0;
+	}
+
+maybe_bridged_with_us:
+	/* Event is neither on a bridge nor a LAG. Check whether it is on an
+	 * interface that is in a bridge with us.
+	 */
+	br = netdev_master_upper_dev_get_rcu(dev);
+	if (!br || !netif_is_bridge_master(br))
+		return 0;
+
+	if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb))
+		return 0;
+
+	return __switchdev_handle_fdb_del_to_device(br, orig_dev, fdb_info,
+						    check_cb, foreign_dev_check_cb,
+						    del_cb, lag_del_cb);
+}
+
+int switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	int err;
+
+	err = __switchdev_handle_fdb_del_to_device(dev, dev, fdb_info,
+						   check_cb,
+						   foreign_dev_check_cb,
+						   del_cb, lag_del_cb);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_fdb_del_to_device);
+
 static int __switchdev_handle_port_obj_add(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
 			bool (*check_cb)(const struct net_device *dev),
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 75b99b7eda22..b15b2b1b2f38 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1886,6 +1886,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 	bool connected = !tipc_sk_type_connectionless(sk);
 	struct tipc_sock *tsk = tipc_sk(sk);
 	int rc, err, hlen, dlen, copy;
+	struct tipc_skb_cb *skb_cb;
 	struct sk_buff_head xmitq;
 	struct tipc_msg *hdr;
 	struct sk_buff *skb;
@@ -1909,6 +1910,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 		if (unlikely(rc))
 			goto exit;
 		skb = skb_peek(&sk->sk_receive_queue);
+		skb_cb = TIPC_SKB_CB(skb);
 		hdr = buf_msg(skb);
 		dlen = msg_data_sz(hdr);
 		hlen = msg_hdr_sz(hdr);
@@ -1928,18 +1930,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 
 	/* Capture data if non-error msg, otherwise just set return value */
 	if (likely(!err)) {
-		copy = min_t(int, dlen, buflen);
-		if (unlikely(copy != dlen))
-			m->msg_flags |= MSG_TRUNC;
-		rc = skb_copy_datagram_msg(skb, hlen, m, copy);
+		int offset = skb_cb->bytes_read;
+
+		copy = min_t(int, dlen - offset, buflen);
+		rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy);
+		if (unlikely(rc))
+			goto exit;
+		if (unlikely(offset + copy < dlen)) {
+			if (flags & MSG_EOR) {
+				if (!(flags & MSG_PEEK))
+					skb_cb->bytes_read = offset + copy;
+			} else {
+				m->msg_flags |= MSG_TRUNC;
+				skb_cb->bytes_read = 0;
+			}
+		} else {
+			if (flags & MSG_EOR)
+				m->msg_flags |= MSG_EOR;
+			skb_cb->bytes_read = 0;
+		}
 	} else {
 		copy = 0;
 		rc = 0;
-		if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control)
+		if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) {
 			rc = -ECONNRESET;
+			goto exit;
+		}
 	}
-	if (unlikely(rc))
-		goto exit;
 
 	/* Mark message as group event if applicable */
 	if (unlikely(grp_evt)) {
@@ -1962,6 +1979,9 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 		tipc_node_distr_xmit(sock_net(sk), &xmitq);
 	}
 
+	if (skb_cb->bytes_read)
+		goto exit;
+
 	tsk_advance_rx_queue(sk);
 
 	if (likely(!connected))
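The TIPC change above lets a SOCK_SEQPACKET receiver drain one record across several calls by passing MSG_EOR to recvmsg(): the kernel keeps the skb queued, tracks progress in skb_cb->bytes_read, and sets MSG_EOR in msg_flags at the record boundary. A hedged userspace sketch (fd is assumed to be a connected TIPC SOCK_SEQPACKET socket):

	#include <string.h>
	#include <sys/socket.h>

	static ssize_t read_record(int fd, char *buf, size_t cap)
	{
		size_t off = 0;

		for (;;) {
			size_t chunk = cap - off < 64 ? cap - off : 64;
			struct iovec iov = { buf + off, chunk };
			struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
			ssize_t n = recvmsg(fd, &msg, MSG_EOR);

			if (n <= 0)
				return n;
			off += n;
			/* MSG_EOR in msg_flags marks the end of the record. */
			if ((msg.msg_flags & MSG_EOR) || off >= cap)
				return off;
		}
	}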
diff --git a/net/unix/Makefile b/net/unix/Makefile
index 54e58cc4f945..20491825b4d0 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX)	+= unix.o
 
 unix-y			:= af_unix.o garbage.o
 unix-$(CONFIG_SYSCTL)	+= sysctl_net_unix.o
+unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
 
 obj-$(CONFIG_UNIX_DIAG)	+= unix_diag.o
 unix_diag-y		:= diag.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ba7ced947e51..256c4e31132e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -494,6 +494,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 			sk_error_report(other);
 		}
 	}
+	sk->sk_state = other->sk_state = TCP_CLOSE;
 }
 
 static void unix_sock_destructor(struct sock *sk)
@@ -669,6 +670,8 @@ static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 				       unsigned int flags);
 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
+static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
+			  sk_read_actor_t recv_actor);
 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 			      int, int);
 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -746,6 +749,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
 	.sendmsg =	unix_dgram_sendmsg,
+	.read_sock =	unix_read_sock,
 	.recvmsg =	unix_dgram_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
@@ -777,10 +781,21 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.show_fdinfo =	unix_show_fdinfo,
 };
 
-static struct proto unix_proto = {
+static void unix_close(struct sock *sk, long timeout)
+{
+	/* Nothing to do here, unix socket does not need a ->close().
+	 * This is merely for sockmap.
+	 */
+}
+
+struct proto unix_proto = {
 	.name			= "UNIX",
 	.owner			= THIS_MODULE,
 	.obj_size		= sizeof(struct unix_sock),
+	.close			= unix_close,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= unix_bpf_update_proto,
+#endif
 };
 
 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
@@ -864,6 +879,7 @@ static int unix_release(struct socket *sock)
 	if (!sk)
 		return 0;
 
+	sk->sk_prot->close(sk, 0);
 	unix_release_sock(sk, 0);
 	sock->sk = NULL;
 
@@ -1199,6 +1215,9 @@ restart:
 		unix_peer(sk)	= other;
 		unix_state_double_unlock(sk, other);
 	}
+
+	if (unix_peer(sk))
+		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
 	return 0;
 
 out_unlock:
@@ -1431,12 +1450,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
 	init_peercred(ska);
 	init_peercred(skb);
 
-	if (ska->sk_type != SOCK_DGRAM) {
-		ska->sk_state = TCP_ESTABLISHED;
-		skb->sk_state = TCP_ESTABLISHED;
-		socka->state  = SS_CONNECTED;
-		sockb->state  = SS_CONNECTED;
-	}
+	ska->sk_state = TCP_ESTABLISHED;
+	skb->sk_state = TCP_ESTABLISHED;
+	socka->state  = SS_CONNECTED;
+	sockb->state  = SS_CONNECTED;
 	return 0;
 }
 
@@ -2128,11 +2145,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
 	}
 }
 
-static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			      size_t size, int flags)
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			 int flags)
 {
 	struct scm_cookie scm;
-	struct sock *sk = sock->sk;
+	struct socket *sock = sk->sk_socket;
 	struct unix_sock *u = unix_sk(sk);
 	struct sk_buff *skb, *last;
 	long timeo;
@@ -2235,6 +2252,53 @@ out:
 	return err;
 }
 
+static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+			      int flags)
+{
+	struct sock *sk = sock->sk;
+
+#ifdef CONFIG_BPF_SYSCALL
+	if (sk->sk_prot != &unix_proto)
+		return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+					    flags & ~MSG_DONTWAIT, NULL);
+#endif
+	return __unix_dgram_recvmsg(sk, msg, size, flags);
+}
+
+static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
+			  sk_read_actor_t recv_actor)
+{
+	int copied = 0;
+
+	while (1) {
+		struct unix_sock *u = unix_sk(sk);
+		struct sk_buff *skb;
+		int used, err;
+
+		mutex_lock(&u->iolock);
+		skb = skb_recv_datagram(sk, 0, 1, &err);
+		mutex_unlock(&u->iolock);
+		if (!skb)
+			return err;
+
+		used = recv_actor(desc, skb, 0, skb->len);
+		if (used <= 0) {
+			if (!copied)
+				copied = used;
+			kfree_skb(skb);
+			break;
+		} else if (used <= skb->len) {
+			copied += used;
+		}
+
+		kfree_skb(skb);
+		if (!desc->count)
+			break;
+	}
+
+	return copied;
+}
+
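unix_read_sock() exposes queued datagrams to in-kernel consumers through a recv_actor callback, the same shape sockmap uses for other families. A sketch of a hypothetical actor (example_recv_actor is an assumption, not part of this patch) that copies into a kernel buffer carried in desc->arg.data:

	static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
				      unsigned int offset, size_t len)
	{
		size_t want = min_t(size_t, len, desc->count);

		/* Copy from the datagram into the caller's buffer. */
		if (skb_copy_bits(skb, offset, desc->arg.data, want))
			return -EFAULT;

		desc->arg.data += want;
		desc->count -= want;	/* 0 stops the unix_read_sock() loop */
		return want;
	}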
 /*
  *	Sleep until more data has arrived. But check for races..
  */
@@ -2972,6 +3036,7 @@ static int __init af_unix_init(void)
 
 	sock_register(&unix_family_ops);
 	register_pernet_subsys(&unix_net_ops);
+	unix_bpf_build_proto();
 out:
 	return rc;
 }
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
new file mode 100644
index 000000000000..db0cda29fb2f
--- /dev/null
+++ b/net/unix/unix_bpf.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */
+
+#include <linux/skmsg.h>
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+
+#define unix_sk_has_data(__sk, __psock)					\
+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
+			!skb_queue_empty(&__psock->ingress_skb) ||	\
+			!list_empty(&__psock->ingress_msg);		\
+		})
+
+static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+			      long timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct unix_sock *u = unix_sk(sk);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	if (!unix_sk_has_data(sk, psock)) {
+		mutex_unlock(&u->iolock);
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		mutex_lock(&u->iolock);
+		ret = unix_sk_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+				  size_t len, int nonblock, int flags,
+				  int *addr_len)
+{
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_psock *psock;
+	int copied, ret;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return __unix_dgram_recvmsg(sk, msg, len, flags);
+
+	mutex_lock(&u->iolock);
+	if (!skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk_psock_queue_empty(psock)) {
+		ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+		goto out;
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = unix_msg_wait_data(sk, psock, timeo);
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+			goto out;
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+out:
+	mutex_unlock(&u->iolock);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
+static struct proto *unix_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_prot_lock);
+static struct proto unix_bpf_prot;
+
+static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = unix_dgram_bpf_recvmsg;
+}
+
+static void unix_bpf_check_needs_rebuild(struct proto *ops)
+{
+	if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
+		spin_lock_bh(&unix_prot_lock);
+		if (likely(ops != unix_prot_saved)) {
+			unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
+			smp_store_release(&unix_prot_saved, ops);
+		}
+		spin_unlock_bh(&unix_prot_lock);
+	}
+}
+
+int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		return 0;
+	}
+
+	unix_bpf_check_needs_rebuild(psock->sk_proto);
+	WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
+	return 0;
+}
+
+void __init unix_bpf_build_proto(void)
+{
+	unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
+}
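With unix_bpf wired up, an AF_UNIX datagram socket can be inserted into a sockmap like a TCP or UDP socket, at which point its proto is swapped to unix_bpf_prot via psock_update_sk_prot. A hedged userspace sketch using libbpf (map and program setup elided; map_fd is assumed to reference an existing BPF_MAP_TYPE_SOCKMAP):

	#include <sys/socket.h>
	#include <bpf/bpf.h>

	static int add_unix_dgram_pair(int map_fd)
	{
		int sfd[2], key = 0;

		if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sfd))
			return -1;

		/* On insert, the kernel redirects this socket's recvmsg()
		 * through unix_dgram_bpf_recvmsg(). */
		return bpf_map_update_elem(map_fd, &key, &sfd[0], BPF_ANY);
	}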